diff options
Diffstat (limited to 'xlators/performance')
57 files changed, 18731 insertions, 16746 deletions
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am index e91d5f6efc8..e95725acb8c 100644 --- a/xlators/performance/Makefile.am +++ b/xlators/performance/Makefile.am @@ -1,3 +1,4 @@ -SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read stat-prefetch +SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache \ + quick-read md-cache open-behind nl-cache CLEANFILES = diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am index 6dd270e8ffc..bfa34ce5502 100644 --- a/xlators/performance/io-cache/src/Makefile.am +++ b/xlators/performance/io-cache/src/Makefile.am @@ -1,14 +1,17 @@ xlator_LTLIBRARIES = io-cache.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -io_cache_la_LDFLAGS = -module -avoidversion +io_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = io-cache.h ioc-mem-types.h +noinst_HEADERS = io-cache.h ioc-mem-types.h io-cache-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(CONTRIBDIR)/rbtree -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(CONTRIBDIR)/rbtree + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/io-cache/src/io-cache-messages.h b/xlators/performance/io-cache/src/io-cache-messages.h new file mode 100644 index 00000000000..38ad0b14d0e --- /dev/null +++ b/xlators/performance/io-cache/src/io-cache-messages.h @@ -0,0 +1,69 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _IO_CACHE_MESSAGES_H_ +#define _IO_CACHE_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(IO_CACHE, IO_CACHE_MSG_ENFORCEMENT_FAILED, + IO_CACHE_MSG_INVALID_ARGUMENT, + IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, IO_CACHE_MSG_NO_MEMORY, + IO_CACHE_MSG_VOL_MISCONFIGURED, IO_CACHE_MSG_INODE_NULL, + IO_CACHE_MSG_PAGE_WAIT_VALIDATE, IO_CACHE_MSG_STR_COVERSION_FAILED, + IO_CACHE_MSG_WASTED_COPY, IO_CACHE_MSG_SET_FD_FAILED, + IO_CACHE_MSG_TABLE_NULL, IO_CACHE_MSG_MEMORY_INIT_FAILED, + IO_CACHE_MSG_NO_CACHE_SIZE_OPT, IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE, + IO_CACHE_MSG_CREATE_MEM_POOL_FAILED, + IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, IO_CACHE_MSG_NULL_PAGE_WAIT, + IO_CACHE_MSG_FRAME_NULL, IO_CACHE_MSG_PAGE_FAULT, + IO_CACHE_MSG_SERVE_READ_REQUEST, IO_CACHE_MSG_LOCAL_NULL, + IO_CACHE_MSG_DEFAULTING_TO_OLD); + +#define IO_CACHE_MSG_NO_MEMORY_STR "out of memory" +#define IO_CACHE_MSG_ENFORCEMENT_FAILED_STR "inode context is NULL" +#define IO_CACHE_MSG_SET_FD_FAILED_STR "failed to set fd ctx" +#define IO_CACHE_MSG_TABLE_NULL_STR "table is NULL" +#define IO_CACHE_MSG_MEMORY_INIT_FAILED_STR "Memory accounting init failed" +#define IO_CACHE_MSG_NO_CACHE_SIZE_OPT_STR "could not get cache-size option" +#define IO_CACHE_MSG_INVALID_ARGUMENT_STR \ + "file size is greater than the max size" +#define IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE_STR "Not 
reconfiguring cache-size" +#define IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED_STR \ + "FATAL: io-cache not configured with exactly one child" +#define IO_CACHE_MSG_VOL_MISCONFIGURED_STR "dangling volume. check volfile" +#define IO_CACHE_MSG_CREATE_MEM_POOL_FAILED_STR \ + "failed to create local_t's memory pool" +#define IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED_STR "Unable to allocate mem_pool" +#define IO_CACHE_MSG_STR_COVERSION_FAILED_STR \ + "asprintf failed while converting prt to str" +#define IO_CACHE_MSG_INODE_NULL_STR "ioc_inode is NULL" +#define IO_CACHE_MSG_PAGE_WAIT_VALIDATE_STR \ + "cache validate called without any page waiting to be validated" +#define IO_CACHE_MSG_NULL_PAGE_WAIT_STR "asked to wait on a NULL page" +#define IO_CACHE_MSG_WASTED_COPY_STR "wasted copy" +#define IO_CACHE_MSG_FRAME_NULL_STR "frame>root>rsp_refs is null" +#define IO_CACHE_MSG_PAGE_FAULT_STR "page fault on a NULL frame" +#define IO_CACHE_MSG_SERVE_READ_REQUEST_STR \ + "NULL page has been provided to serve read request" +#define IO_CACHE_MSG_LOCAL_NULL_STR "local is NULL" +#define IO_CACHE_MSG_DEFAULTING_TO_OLD_STR \ + "minimum size of file that can be cached is greater than maximum size. " \ + "Hence Defaulting to old value" +#endif /* _IO_CACHE_MESSAGES_H_ */ diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c index 009e7cf28e1..9375d29c17f 100644 --- a/xlators/performance/io-cache/src/io-cache.c +++ b/xlators/performance/io-cache/src/io-cache.c @@ -1,59 +1,45 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <math.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-cache.h" #include "ioc-mem-types.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include <assert.h> #include <sys/time.h> - +#include "io-cache-messages.h" int ioc_log2_page_size; uint32_t -ioc_get_priority (ioc_table_t *table, const char *path); - -uint32_t -ioc_get_priority (ioc_table_t *table, const char *path); +ioc_get_priority(ioc_table_t *table, const char *path); struct volume_options options[]; - -inline uint32_t -ioc_hashfn (void *data, int len) +static uint32_t +ioc_hashfn(void *data, int len) { - off_t offset; + off_t offset; - offset = *(off_t *) data; + offset = *(off_t *)data; - return (offset >> ioc_log2_page_size); + return (offset >> ioc_log2_page_size); } -inline ioc_inode_t * +/* TODO: This function is not used, uncomment when we find a + usage for this function. 
+ +static ioc_inode_t * ioc_inode_reupdate (ioc_inode_t *ioc_inode) { ioc_table_t *table = NULL; @@ -66,7 +52,8 @@ ioc_inode_reupdate (ioc_inode_t *ioc_inode) return ioc_inode; } -inline ioc_inode_t * + +static ioc_inode_t * ioc_get_inode (dict_t *dict, char *name) { ioc_inode_t *ioc_inode = NULL; @@ -89,22 +76,74 @@ ioc_get_inode (dict_t *dict, char *name) return ioc_inode; } +*/ -int32_t -ioc_inode_need_revalidate (ioc_inode_t *ioc_inode) +int +ioc_update_pages(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iovec *vector, int32_t count, int op_ret, off_t offset) { - int8_t need_revalidate = 0; - struct timeval tv = {0,}; - ioc_table_t *table = NULL; + size_t size = 0; + off_t rounded_offset = 0, rounded_end = 0, trav_offset = 0, + write_offset = 0; + off_t page_offset = 0, page_end = 0; + ioc_page_t *trav = NULL; - table = ioc_inode->table; + size = iov_length(vector, count); + size = min(size, op_ret); + + rounded_offset = gf_floor(offset, ioc_inode->table->page_size); + rounded_end = gf_roof(offset + size, ioc_inode->table->page_size); - gettimeofday (&tv, NULL); + trav_offset = rounded_offset; + ioc_inode_lock(ioc_inode); + { + while (trav_offset < rounded_end) { + trav = __ioc_page_get(ioc_inode, trav_offset); + if (trav && trav->ready) { + if (trav_offset == rounded_offset) + page_offset = offset - rounded_offset; + else + page_offset = 0; + + if ((trav_offset + ioc_inode->table->page_size) >= + rounded_end) { + page_end = trav->size - (rounded_end - (offset + size)); + } else { + page_end = trav->size; + } + + iov_range_copy(trav->vector, trav->count, page_offset, vector, + count, write_offset, page_end - page_offset); + } else if (trav) { + if (!trav->waitq) + ioc_inode->table->cache_used -= __ioc_page_destroy(trav); + } + + if (trav_offset == rounded_offset) + write_offset += (ioc_inode->table->page_size - + (offset - rounded_offset)); + else + write_offset += ioc_inode->table->page_size; + + trav_offset += ioc_inode->table->page_size; + } + } + 
ioc_inode_unlock(ioc_inode); - if (time_elapsed (&tv, &ioc_inode->cache.tv) >= table->cache_timeout) - need_revalidate = 1; + return 0; +} + +static gf_boolean_t +ioc_inode_need_revalidate(ioc_inode_t *ioc_inode) +{ + ioc_table_t *table = NULL; + + GF_ASSERT(ioc_inode); + table = ioc_inode->table; + GF_ASSERT(table); - return need_revalidate; + return (gf_time() - ioc_inode->cache.last_revalidate >= + table->cache_timeout); } /* @@ -115,193 +154,201 @@ ioc_inode_need_revalidate (ioc_inode_t *ioc_inode) * assumes lock is held */ int64_t -__ioc_inode_flush (ioc_inode_t *ioc_inode) +__ioc_inode_flush(ioc_inode_t *ioc_inode) { - ioc_page_t *curr = NULL, *next = NULL; - int64_t destroy_size = 0; - int64_t ret = 0; + ioc_page_t *curr = NULL, *next = NULL; + int64_t destroy_size = 0; + int64_t ret = 0; - list_for_each_entry_safe (curr, next, &ioc_inode->cache.page_lru, - page_lru) { - ret = __ioc_page_destroy (curr); + list_for_each_entry_safe(curr, next, &ioc_inode->cache.page_lru, page_lru) + { + ret = __ioc_page_destroy(curr); - if (ret != -1) - destroy_size += ret; - } + if (ret != -1) + destroy_size += ret; + } - return destroy_size; + return destroy_size; } void -ioc_inode_flush (ioc_inode_t *ioc_inode) +ioc_inode_flush(ioc_inode_t *ioc_inode) { - int64_t destroy_size = 0; + int64_t destroy_size = 0; - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + destroy_size = __ioc_inode_flush(ioc_inode); + } + ioc_inode_unlock(ioc_inode); - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); + if (destroy_size) { + ioc_table_lock(ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; } + ioc_table_unlock(ioc_inode->table); + } - return; + return; } int32_t -ioc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - 
struct iatt *preop, struct iatt *postop) +ioc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop); - return 0; + STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop, postop, xdata); + return 0; } int32_t -ioc_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) +ioc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + inode_ctx_get(loc->inode, this, &ioc_inode); - if (ioc_inode - && ((valid & GF_SET_ATTR_ATIME) - || (valid & GF_SET_ATTR_MTIME))) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode && + ((valid & GF_SET_ATTR_ATIME) || (valid & GF_SET_ATTR_MTIME))) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_setattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid); + STACK_WIND(frame, ioc_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); - return 0; + return 0; } int32_t -ioc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *dict, struct iatt *postparent) +ioc_inode_update(xlator_t *this, inode_t *inode, char *path, struct iatt *iabuf) { - ioc_inode_t *ioc_inode = NULL; - ioc_table_t *table = NULL; - uint8_t cache_still_valid = 0; - uint64_t tmp_ioc_inode = 0; - uint32_t weight = 0xffffffff; - const char *path = NULL; - ioc_local_t *local = NULL; - - if (op_ret != 0) - goto out; + ioc_table_t *table = NULL; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + gf_boolean_t cache_still_valid = _gf_false; - local = frame->local; - 
if (local == NULL) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + if (!this || !inode) + goto out; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; + table = this->private; + + LOCK(&inode->lock); + { + (void)__inode_ctx_get(inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + + if (!ioc_inode) { + weight = ioc_get_priority(table, path); + + ioc_inode = ioc_inode_create(table, inode, weight); + + (void)__inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode); } + } + UNLOCK(&inode->lock); - table = this->private; + ioc_inode_lock(ioc_inode); + { + if (ioc_inode->cache.mtime == 0) { + ioc_inode->cache.mtime = iabuf->ia_mtime; + ioc_inode->cache.mtime_nsec = iabuf->ia_mtime_nsec; + } - path = local->file_loc.path; + ioc_inode->ia_size = iabuf->ia_size; + } + ioc_inode_unlock(ioc_inode); - LOCK (&inode->lock); - { - __inode_ctx_get (inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + cache_still_valid = ioc_cache_still_valid(ioc_inode, iabuf); - if (!ioc_inode) { - weight = ioc_get_priority (table, path); + if (!cache_still_valid) { + ioc_inode_flush(ioc_inode); + } - ioc_inode = ioc_inode_update (table, inode, - weight); + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock(ioc_inode->table); - __inode_ctx_put (inode, this, - (uint64_t)(long)ioc_inode); - } - } - UNLOCK (&inode->lock); +out: + return 0; +} - ioc_inode_lock (ioc_inode); - { - if (ioc_inode->cache.mtime == 0) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; - } +int32_t +ioc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) +{ + ioc_local_t *local = NULL; - ioc_inode->ia_size = stbuf->ia_size; - } - ioc_inode_unlock (ioc_inode); + if (op_ret 
!= 0) + goto out; - cache_still_valid = ioc_cache_still_valid (ioc_inode, - stbuf); + local = frame->local; + if (local == NULL) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - if (!cache_still_valid) { - ioc_inode_flush (ioc_inode); - } + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); + ioc_inode_update(this, inode, (char *)local->file_loc.path, stbuf); out: - if (frame->local != NULL) { - local = frame->local; - loc_wipe (&local->file_loc); - } + if (frame->local != NULL) { + local = frame->local; + loc_wipe(&local->file_loc); + } - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, stbuf, - dict, postparent); - return 0; + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xdata, + postparent); + return 0; } int32_t -ioc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) +ioc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - ioc_local_t *local = NULL; - int32_t op_errno = -1, ret = -1; + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; - local = GF_CALLOC (1, sizeof (*local), - gf_ioc_mt_ioc_local_t); - if (local == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + local = mem_get0(this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - ret = loc_copy (&local->file_loc, loc); - if (ret != 0) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + ret = loc_copy(&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - frame->local = local; + frame->local = local; - STACK_WIND 
(frame, ioc_lookup_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + STACK_WIND(frame, ioc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, NULL, NULL, - NULL, NULL); + if (local != NULL) { + loc_wipe(&local->file_loc); + mem_put(local); + } - return 0; + STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; } /* @@ -313,18 +360,30 @@ unwind: * */ int32_t -ioc_forget (xlator_t *this, inode_t *inode) +ioc_forget(xlator_t *this, inode_t *inode) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (inode, this, &ioc_inode); + inode_ctx_get(inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_destroy((ioc_inode_t *)(long)ioc_inode); - return 0; + return 0; } +static int32_t +ioc_invalidate(xlator_t *this, inode_t *inode) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get(inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(uintptr_t)ioc_inode); + + return 0; +} /* * ioc_cache_validate_cbk - @@ -338,103 +397,103 @@ ioc_forget (xlator_t *this, inode_t *inode) * */ int32_t -ioc_cache_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf) +ioc_cache_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_inode_t *ioc_inode = NULL; - size_t destroy_size = 0; - struct iatt *local_stbuf = NULL; - - local = frame->local; - ioc_inode = local->inode; - local_stbuf = stbuf; - - if ((op_ret == -1) || - ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, - "cache for inode(%p) is invalid. 
flushing all pages", - ioc_inode); - /* NOTE: only pages with no waiting frames are flushed by - * ioc_inode_flush. page_fault will be generated for all - * the pages which have waiting frames by ioc_inode_wakeup() - */ - ioc_inode_lock (ioc_inode); - { - destroy_size = __ioc_inode_flush (ioc_inode); - if (op_ret >= 0) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec - = stbuf->ia_mtime_nsec; - } - } - ioc_inode_unlock (ioc_inode); - local_stbuf = NULL; - } - - if (destroy_size) { - ioc_table_lock (ioc_inode->table); - { - ioc_inode->table->cache_used -= destroy_size; - } - ioc_table_unlock (ioc_inode->table); + ioc_local_t *local = NULL; + ioc_inode_t *ioc_inode = NULL; + size_t destroy_size = 0; + struct iatt *local_stbuf = NULL; + + local = frame->local; + ioc_inode = local->inode; + local_stbuf = stbuf; + + if ((op_ret == -1) || + ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_msg_debug(ioc_inode->table->xl->name, 0, + "cache for inode(%p) is invalid. flushing all pages", + ioc_inode); + /* NOTE: only pages with no waiting frames are flushed by + * ioc_inode_flush. 
page_fault will be generated for all + * the pages which have waiting frames by ioc_inode_wakeup() + */ + ioc_inode_lock(ioc_inode); + { + destroy_size = __ioc_inode_flush(ioc_inode); + if (op_ret >= 0) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } + } + ioc_inode_unlock(ioc_inode); + local_stbuf = NULL; + } + + if (destroy_size) { + ioc_table_lock(ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; } + ioc_table_unlock(ioc_inode->table); + } - if (op_ret < 0) - local_stbuf = NULL; + if (op_ret < 0) + local_stbuf = NULL; - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->cache.tv, NULL); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.last_revalidate = gf_time(); + } + ioc_inode_unlock(ioc_inode); - ioc_inode_wakeup (frame, ioc_inode, local_stbuf); + ioc_inode_wakeup(frame, ioc_inode, local_stbuf); - /* any page-fault initiated by ioc_inode_wakeup() will have its own - * fd_ref on fd, safe to unref validate frame's private copy - */ - fd_unref (local->fd); + /* any page-fault initiated by ioc_inode_wakeup() will have its own + * fd_ref on fd, safe to unref validate frame's private copy + */ + fd_unref(local->fd); + dict_unref(local->xattr_req); - STACK_DESTROY (frame->root); + STACK_DESTROY(frame->root); - return 0; + return 0; } int32_t -ioc_wait_on_inode (ioc_inode_t *ioc_inode, ioc_page_t *page) +ioc_wait_on_inode(ioc_inode_t *ioc_inode, ioc_page_t *page) { - ioc_waitq_t *waiter = NULL, *trav = NULL; - uint32_t page_found = 0; - int32_t ret = 0; + ioc_waitq_t *waiter = NULL, *trav = NULL; + uint32_t page_found = 0; + int32_t ret = 0; - trav = ioc_inode->waitq; + trav = ioc_inode->waitq; - while (trav) { - if (trav->data == page) { - page_found = 1; - break; - } - trav = trav->next; + while (trav) { + if (trav->data == page) { + page_found = 1; + break; } + trav = trav->next; + } - if (!page_found) { - waiter = GF_CALLOC (1, sizeof 
(ioc_waitq_t), - gf_ioc_mt_ioc_waitq_t); - if (waiter == NULL) { - gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, - "out of memory"); - ret = -ENOMEM; - goto out; - } - - waiter->data = page; - waiter->next = ioc_inode->waitq; - ioc_inode->waitq = waiter; + if (!page_found) { + waiter = GF_CALLOC(1, sizeof(ioc_waitq_t), gf_ioc_mt_ioc_waitq_t); + if (waiter == NULL) { + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_NO_MEMORY, NULL); + ret = -ENOMEM; + goto out; } + waiter->data = page; + waiter->next = ioc_inode->waitq; + ioc_inode->waitq = waiter; + } + out: - return ret; + return ret; } /* @@ -446,75 +505,77 @@ out: * */ int32_t -ioc_cache_validate (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, - ioc_page_t *page) +ioc_cache_validate(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, + ioc_page_t *page) { - call_frame_t *validate_frame = NULL; - ioc_local_t *validate_local = NULL; - ioc_local_t *local = NULL; - int32_t ret = 0; - - local = frame->local; - validate_local = GF_CALLOC (1, sizeof (ioc_local_t), - gf_ioc_mt_ioc_local_t); - if (validate_local == NULL) { - ret = -1; - local->op_ret = -1; - local->op_errno = ENOMEM; - gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, - "out of memory"); - goto out; - } - - validate_frame = copy_frame (frame); - if (validate_frame == NULL) { - ret = -1; - local->op_ret = -1; - local->op_errno = ENOMEM; - GF_FREE (validate_local); - gf_log (ioc_inode->table->xl->name, GF_LOG_ERROR, - "out of memory"); - goto out; - } - - validate_local->fd = fd_ref (fd); - validate_local->inode = ioc_inode; - validate_frame->local = validate_local; - - STACK_WIND (validate_frame, ioc_cache_validate_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->fstat, fd); + call_frame_t *validate_frame = NULL; + ioc_local_t *validate_local = NULL; + ioc_local_t *local = NULL; + int32_t ret = 0; + + local = frame->local; + validate_local = mem_get0(THIS->local_pool); + if (validate_local == 
NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + validate_frame = copy_frame(frame); + if (validate_frame == NULL) { + ret = -1; + local->op_ret = -1; + local->op_errno = ENOMEM; + mem_put(validate_local); + gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + validate_local->fd = fd_ref(fd); + validate_local->inode = ioc_inode; + if (local && local->xattr_req) + validate_local->xattr_req = dict_ref(local->xattr_req); + validate_frame->local = validate_local; + + STACK_WIND(validate_frame, ioc_cache_validate_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->fstat, fd, + validate_local->xattr_req); out: - return ret; + return ret; } -inline uint32_t -is_match (const char *path, const char *pattern) +static uint32_t +is_match(const char *path, const char *pattern) { - int32_t ret = 0; + int32_t ret = 0; - ret = fnmatch (pattern, path, FNM_NOESCAPE); + ret = fnmatch(pattern, path, FNM_NOESCAPE); - return (ret == 0); + return (ret == 0); } uint32_t -ioc_get_priority (ioc_table_t *table, const char *path) +ioc_get_priority(ioc_table_t *table, const char *path) { - uint32_t priority = 1; - struct ioc_priority *curr = NULL; + uint32_t priority = 1; + struct ioc_priority *curr = NULL; - if (list_empty(&table->priority_list)) - return priority; + if (list_empty(&table->priority_list) || !path) + return priority; - priority = 0; - list_for_each_entry (curr, &table->priority_list, list) { - if (is_match (path, curr->pattern)) - priority = curr->priority; - } + priority = 0; + list_for_each_entry(curr, &table->priority_list, list) + { + if (is_match(path, curr->pattern)) + priority = curr->priority; + } - return priority; + return priority; } /* @@ -529,72 +590,68 @@ ioc_get_priority (ioc_table_t *table, const char *path) * */ int32_t -ioc_open_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) +ioc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; + uint64_t tmp_ioc_inode = 0; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; + + if (op_ret != -1) { + inode_ctx_get(fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; + // TODO: see why inode context is NULL and handle it. + if (!ioc_inode) { + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, + IO_CACHE_MSG_ENFORCEMENT_FAILED, "inode-gfid=%s", + uuid_utoa(fd->inode->gfid), NULL); + goto out; } - table = this->private; - - if (op_ret != -1) { - inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &table->inode_lru[ioc_inode->weight]); - } - ioc_table_unlock (ioc_inode->table); + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock(ioc_inode->table); - ioc_inode_lock (ioc_inode); - { - if ((table->min_file_size > ioc_inode->ia_size) - || ((table->max_file_size > 0) - && (table->max_file_size < ioc_inode->ia_size))) { - fd_ctx_set (fd, this, 1); - } - } - ioc_inode_unlock (ioc_inode); - - /* If O_DIRECT open, we disable caching on it */ - if ((local->flags & O_DIRECT)){ - /* O_DIRECT is only for one fd, not the inode - * as a whole - */ - fd_ctx_set (fd, this, 1); - } - if ((local->wbflags & GF_OPEN_NOWB) 
!= 0) { - /* disable caching as asked by NFS */ - fd_ctx_set (fd, this, 1); - } + ioc_inode_lock(ioc_inode); + { + if ((table->min_file_size > ioc_inode->ia_size) || + ((table->max_file_size > 0) && + (table->max_file_size < ioc_inode->ia_size))) { + fd_ctx_set(fd, this, 1); + } + } + ioc_inode_unlock(ioc_inode); - /* weight = 0, we disable caching on it */ - if (weight == 0) { - /* we allow a pattern-matched cache disable this way - */ - fd_ctx_set (fd, this, 1); - } + /* If O_DIRECT open, we disable caching on it */ + if ((local->flags & O_DIRECT)) { + /* O_DIRECT is only for one fd, not the inode + * as a whole + */ + fd_ctx_set(fd, this, 1); } + } out: - GF_FREE (local); - frame->local = NULL; + mem_put(local); + frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } /* @@ -611,185 +668,175 @@ out: * */ int32_t -ioc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +ioc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; - const char *path = NULL; - int ret = -1; - - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - table = this->private; - path = local->file_loc.path; - - if (op_ret != -1) { - /* assign weight */ - weight = ioc_get_priority (table, path); - - ioc_inode = ioc_inode_update (table, inode, weight); - - ioc_inode_lock (ioc_inode); - { - ioc_inode->cache.mtime = buf->ia_mtime; - ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; - ioc_inode->ia_size = 
buf->ia_size; - - if ((table->min_file_size > ioc_inode->ia_size) - || ((table->max_file_size > 0) - && (table->max_file_size < ioc_inode->ia_size))) { - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - } - ioc_inode_unlock (ioc_inode); - - inode_ctx_put (fd->inode, this, - (uint64_t)(long)ioc_inode); - - /* If O_DIRECT open, we disable caching on it */ - if (local->flags & O_DIRECT) { - /* - * O_DIRECT is only for one fd, not the inode - * as a whole */ - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - - /* if weight == 0, we disable caching on it */ - if (!weight) { - /* we allow a pattern-matched cache disable this way */ - ret = fd_ctx_set (fd, this, 1); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set fd ctx", - local->file_loc.path); - } - - } + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; + int ret = -1; + + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + table = this->private; + path = local->file_loc.path; + + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority(table, path); + + ioc_inode = ioc_inode_create(table, inode, weight); + + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; + + if ((table->min_file_size > ioc_inode->ia_size) || + ((table->max_file_size > 0) && + (table->max_file_size < ioc_inode->ia_size))) { + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + } + ioc_inode_unlock(ioc_inode); + + inode_ctx_put(fd->inode, this, 
(uint64_t)(long)ioc_inode); + + /* If O_DIRECT open, we disable caching on it */ + if (local->flags & O_DIRECT) { + /* + * O_DIRECT is only for one fd, not the inode + * as a whole */ + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + + /* if weight == 0, we disable caching on it */ + if (!weight) { + /* we allow a pattern-matched cache disable this way */ + ret = fd_ctx_set(fd, this, 1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_SET_FD_FAILED, "path=%s", + local->file_loc.path, NULL); + } + } out: - frame->local = NULL; - GF_FREE (local); + frame->local = NULL; + mem_put(local); - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int32_t -ioc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +ioc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_inode_t *ioc_inode = NULL; - uint32_t weight = 0xffffffff; - const char *path = NULL; + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0xffffffff; + const char *path = NULL; - local = frame->local; - if (!this || !this->private) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + local = frame->local; + if (!this || !this->private) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - table = this->private; - path = local->file_loc.path; + table = this->private; + path = local->file_loc.path; - 
if (op_ret != -1) { - /* assign weight */ - weight = ioc_get_priority (table, path); + if (op_ret != -1) { + /* assign weight */ + weight = ioc_get_priority(table, path); - ioc_inode = ioc_inode_update (table, inode, weight); - - ioc_inode_lock (ioc_inode); - { - ioc_inode->cache.mtime = buf->ia_mtime; - ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; - ioc_inode->ia_size = buf->ia_size; - } - ioc_inode_unlock (ioc_inode); + ioc_inode = ioc_inode_create(table, inode, weight); - inode_ctx_put (inode, this, - (uint64_t)(long)ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.mtime = buf->ia_mtime; + ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec; + ioc_inode->ia_size = buf->ia_size; } + ioc_inode_unlock(ioc_inode); + + inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode); + } out: - frame->local = NULL; + frame->local = NULL; - loc_wipe (&local->file_loc); - GF_FREE (local); + loc_wipe(&local->file_loc); + mem_put(local); - STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; + STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; } - int -ioc_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) +ioc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - ioc_local_t *local = NULL; - int32_t op_errno = -1, ret = -1; + ioc_local_t *local = NULL; + int32_t op_errno = -1, ret = -1; - local = GF_CALLOC (1, sizeof (*local), - gf_ioc_mt_ioc_local_t); - if (local == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto unwind; - } + local = mem_get0(this->local_pool); + if (local == NULL) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - ret = loc_copy (&local->file_loc, loc); - if (ret != 0) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, 
"out of memory"); - goto unwind; - } + ret = loc_copy(&local->file_loc, loc); + if (ret != 0) { + op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL); + goto unwind; + } - frame->local = local; + frame->local = local; - STACK_WIND (frame, ioc_mknod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, params); - return 0; + STACK_WIND(frame, ioc_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; unwind: - if (local != NULL) { - loc_wipe (&local->file_loc); - GF_FREE (local); - } + if (local != NULL) { + loc_wipe(&local->file_loc); + mem_put(local); + } - STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, NULL, - NULL, NULL); + STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } - /* * ioc_open - open fop for io cache * @frame: @@ -799,30 +846,28 @@ unwind: * */ int32_t -ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +ioc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { + ioc_local_t *local = NULL; - ioc_local_t *local = NULL; - - local = GF_CALLOC (1, sizeof (ioc_local_t), gf_ioc_mt_ioc_local_t); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL); - return 0; - } + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); + return 0; + } - local->flags = flags; - local->file_loc.path = loc->path; - local->file_loc.inode = loc->inode; - local->wbflags = wbflags; + local->flags = flags; + local->file_loc.path = loc->path; + local->file_loc.inode = loc->inode; - frame->local = local; + frame->local = local; - STACK_WIND (frame, ioc_open_cbk, FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->open, loc, flags, fd, wbflags); + STACK_WIND(frame, ioc_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; + return 0; } /* @@ -836,32 +881,29 @@ ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, * */ int32_t -ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd, dict_t *params) +ioc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - ioc_local_t *local = NULL; - - local = GF_CALLOC (1, sizeof (ioc_local_t), gf_ioc_mt_ioc_local_t); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, - NULL, NULL, NULL); - return 0; - } - - local->flags = flags; - local->file_loc.path = loc->path; - frame->local = local; - - STACK_WIND (frame, ioc_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, mode, - fd, params); + ioc_local_t *local = NULL; + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; -} + } + local->flags = flags; + local->file_loc.path = loc->path; + frame->local = local; + STACK_WIND(frame, ioc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} /* * ioc_release - release fop for io cache @@ -872,49 +914,26 @@ ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, * */ int32_t -ioc_release (xlator_t *this, fd_t *fd) +ioc_release(xlator_t *this, fd_t *fd) { - return 0; + return 0; } -/* - * ioc_readv_disabled_cbk - * @frame: - * @cookie: - * @this: - * @op_ret: - * @op_errno: - * @vector: - * @count: - * - */ int32_t -ioc_readv_disabled_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, - struct iobref *iobref) +ioc_need_prune(ioc_table_t *table) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref); - return 0; -} + int64_t cache_difference = 0; + ioc_table_lock(table); + { + cache_difference = table->cache_used - table->cache_size; + } + ioc_table_unlock(table); -int32_t -ioc_need_prune (ioc_table_t *table) -{ - int64_t cache_difference = 0; - - ioc_table_lock (table); - { - cache_difference = table->cache_used - table->cache_size; - } - ioc_table_unlock (table); - - if (cache_difference > 0) - return 1; - else - return 0; + if (cache_difference > 0) + return 1; + else + return 0; } /* @@ -926,155 +945,151 @@ ioc_need_prune (ioc_table_t *table) * */ void -ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, - off_t offset, size_t size) +ioc_dispatch_requests(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd, + off_t offset, size_t size) { - ioc_local_t *local = NULL; - ioc_table_t *table = NULL; - ioc_page_t *trav = NULL; - ioc_waitq_t *waitq = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - int32_t fault = 0; - size_t trav_size = 0; - off_t local_offset = 0; - int32_t ret = -1; - int8_t need_validate = 0; - int8_t might_need_validate = 0; /* - * if a page exists, do we need - * to validate it? - */ - local = frame->local; - table = ioc_inode->table; - - rounded_offset = floor (offset, table->page_size); - rounded_end = roof (offset + size, table->page_size); - trav_offset = rounded_offset; - - /* once a frame does read, it should be waiting on something */ - local->wait_count++; - - /* Requested region can fall in three different pages, - * 1. Ready - region is already in cache, we just have to serve it. - * 2. In-transit - page fault has been generated on this page, we need - * to wait till the page is ready - * 3. 
Fault - page is not in cache, we have to generate a page fault - */ + ioc_local_t *local = NULL; + ioc_table_t *table = NULL; + ioc_page_t *trav = NULL; + ioc_waitq_t *waitq = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + int32_t fault = 0; + size_t trav_size = 0; + off_t local_offset = 0; + int32_t ret = -1; + int8_t need_validate = 0; + int8_t might_need_validate = 0; /* + * if a page exists, do we need + * to validate it? + */ + local = frame->local; + table = ioc_inode->table; + + rounded_offset = gf_floor(offset, table->page_size); + rounded_end = gf_roof(offset + size, table->page_size); + trav_offset = rounded_offset; + + /* once a frame does read, it should be waiting on something */ + local->wait_count++; + + /* Requested region can fall in three different pages, + * 1. Ready - region is already in cache, we just have to serve it. + * 2. In-transit - page fault has been generated on this page, we need + * to wait till the page is ready + * 3. 
Fault - page is not in cache, we have to generate a page fault + */ + + might_need_validate = ioc_inode_need_revalidate(ioc_inode); + + while (trav_offset < rounded_end) { + ioc_inode_lock(ioc_inode); + { + /* look for requested region in the cache */ + trav = __ioc_page_get(ioc_inode, trav_offset); - might_need_validate = ioc_inode_need_revalidate (ioc_inode); + local_offset = max(trav_offset, offset); + trav_size = min(((offset + size) - local_offset), table->page_size); - while (trav_offset < rounded_end) { - ioc_inode_lock (ioc_inode); - { - /* look for requested region in the cache */ - trav = __ioc_page_get (ioc_inode, trav_offset); - - local_offset = max (trav_offset, offset); - trav_size = min (((offset+size) - local_offset), - table->page_size); - - if (!trav) { - /* page not in cache, we need to generate page - * fault - */ - trav = __ioc_page_create (ioc_inode, - trav_offset); - fault = 1; - if (!trav) { - gf_log (frame->this->name, - GF_LOG_CRITICAL, - "out of memory"); - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - } + if (!trav) { + /* page not in cache, we need to generate page + * fault + */ + trav = __ioc_page_create(ioc_inode, trav_offset); + fault = 1; + if (!trav) { + gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM, + IO_CACHE_MSG_NO_MEMORY, NULL); + local->op_ret = -1; + local->op_errno = ENOMEM; + ioc_inode_unlock(ioc_inode); + goto out; + } + } + + __ioc_wait_on_page(trav, frame, local_offset, trav_size); + + if (trav->ready) { + /* page found in cache */ + if (!might_need_validate && !ioc_inode->waitq) { + /* fresh enough */ + gf_msg_trace(frame->this->name, 0, + "cache hit for " + "trav_offset=%" PRId64 + "/local_" + "offset=%" PRId64 "", + trav_offset, local_offset); + waitq = __ioc_page_wakeup(trav, trav->op_errno); + } else { + /* if waitq already exists, fstat + * revalidate is + * already on the way + */ + if (!ioc_inode->waitq) { + need_validate = 1; + } + + ret = ioc_wait_on_inode(ioc_inode, trav); + if (ret < 
0) { + local->op_ret = -1; + local->op_errno = -ret; + need_validate = 0; - __ioc_wait_on_page (trav, frame, local_offset, - trav_size); - - if (trav->ready) { - /* page found in cache */ - if (!might_need_validate && !ioc_inode->waitq) { - /* fresh enough */ - gf_log (frame->this->name, GF_LOG_TRACE, - "cache hit for trav_offset=%" - PRId64"/local_offset=%"PRId64"", - trav_offset, local_offset); - waitq = __ioc_page_wakeup (trav); - } else { - /* if waitq already exists, fstat - * revalidate is - * already on the way - */ - if (!ioc_inode->waitq) { - need_validate = 1; - } - - ret = ioc_wait_on_inode (ioc_inode, - trav); - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; - need_validate = 0; - - waitq = __ioc_page_wakeup (trav); - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - waitq = NULL; - goto out; - } - } - } + waitq = __ioc_page_wakeup(trav, trav->op_errno); + ioc_inode_unlock(ioc_inode); + ioc_waitq_return(waitq); + waitq = NULL; + goto out; + } } - ioc_inode_unlock (ioc_inode); + } + } + ioc_inode_unlock(ioc_inode); - ioc_waitq_return (waitq); - waitq = NULL; + ioc_waitq_return(waitq); + waitq = NULL; - if (fault) { - fault = 0; - /* new page created, increase the table->cache_used */ - ioc_page_fault (ioc_inode, frame, fd, trav_offset); - } + if (fault) { + fault = 0; + /* new page created, increase the table->cache_used */ + ioc_page_fault(ioc_inode, frame, fd, trav_offset); + } - if (need_validate) { - need_validate = 0; - gf_log (frame->this->name, GF_LOG_TRACE, - "sending validate request for " - "inode(%s) at offset=%"PRId64"", - uuid_utoa (fd->inode->gfid), trav_offset); - ret = ioc_cache_validate (frame, ioc_inode, fd, trav); - if (ret == -1) { - ioc_inode_lock (ioc_inode); - { - waitq = __ioc_page_wakeup (trav); - } - ioc_inode_unlock (ioc_inode); - - ioc_waitq_return (waitq); - waitq = NULL; - goto out; - } + if (need_validate) { + need_validate = 0; + gf_msg_trace(frame->this->name, 0, + "sending validate request for 
" + "inode(%s) at offset=%" PRId64 "", + uuid_utoa(fd->inode->gfid), trav_offset); + ret = ioc_cache_validate(frame, ioc_inode, fd, trav); + if (ret == -1) { + ioc_inode_lock(ioc_inode); + { + waitq = __ioc_page_wakeup(trav, trav->op_errno); } + ioc_inode_unlock(ioc_inode); - trav_offset += table->page_size; + ioc_waitq_return(waitq); + waitq = NULL; + goto out; + } } + trav_offset += table->page_size; + } + out: - ioc_frame_return (frame); + ioc_frame_return(frame); - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } + if (ioc_need_prune(ioc_inode->table)) { + ioc_prune(ioc_inode->table); + } - return; + return; } - /* * ioc_readv - * @@ -1086,127 +1101,108 @@ out: * */ int32_t -ioc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) +ioc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_local_t *local = NULL; - uint32_t weight = 0; - ioc_table_t *table = NULL; - uint32_t num_pages = 0; - int32_t op_errno = -1; - - if (!this) { - goto out; - } - - inode_ctx_get (fd->inode, this, &tmp_ioc_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; - if (!ioc_inode) { - /* caching disabled, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, fd, size, - offset); - return 0; - } - - - table = this->private; - - if (!table) { - gf_log (this->name, GF_LOG_ERROR, "table is null"); - op_errno = EINVAL; - goto out; - } - - - ioc_table_lock (table); - { - if (!table->mem_pool) { + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_local_t *local = NULL; + uint32_t weight = 0; + ioc_table_t *table = NULL; + int32_t op_errno = EINVAL; + + if (!this) { + goto out; + } + + inode_ctx_get(fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if 
(!ioc_inode) { + /* caching disabled, go ahead with normal readv */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } - num_pages = (table->cache_size / table->page_size) - + ((table->cache_size % table->page_size) - ? 1 : 0); + if (flags & O_DIRECT) { + /* disable caching for this fd, if O_DIRECT is used */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } - table->mem_pool - = mem_pool_new (rbthash_entry_t, num_pages); + table = this->private; - if (!table->mem_pool) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to allocate mem_pool"); - op_errno = ENOMEM; - ioc_table_unlock (table); - goto out; - } - } - } - ioc_table_unlock (table); + if (!table) { + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, IO_CACHE_MSG_TABLE_NULL, + NULL); + op_errno = EINVAL; + goto out; + } - ioc_inode_lock (ioc_inode); - { - if (!ioc_inode->cache.page_table) { - ioc_inode->cache.page_table - = rbthash_table_init - (IOC_PAGE_TABLE_BUCKET_COUNT, - ioc_hashfn, NULL, 0, - table->mem_pool); - - if (ioc_inode->cache.page_table == NULL) { - op_errno = ENOMEM; - ioc_inode_unlock (ioc_inode); - goto out; - } - } - } - ioc_inode_unlock (ioc_inode); - - if (!fd_ctx_get (fd, this, NULL)) { - /* disable caching for this fd, go ahead with normal readv */ - STACK_WIND (frame, ioc_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, fd, size, - offset); - return 0; - } + ioc_inode_lock(ioc_inode); + { + if (!ioc_inode->cache.page_table) { + ioc_inode->cache.page_table = rbthash_table_init( + this->ctx, IOC_PAGE_TABLE_BUCKET_COUNT, ioc_hashfn, NULL, 0, + table->mem_pool); - local = (ioc_local_t *) GF_CALLOC (1, sizeof (ioc_local_t), - gf_ioc_mt_ioc_local_t); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); + if (ioc_inode->cache.page_table == NULL) { op_errno = ENOMEM; + 
ioc_inode_unlock(ioc_inode); goto out; + } } + } + ioc_inode_unlock(ioc_inode); - INIT_LIST_HEAD (&local->fill_list); - - frame->local = local; - local->pending_offset = offset; - local->pending_size = size; - local->offset = offset; - local->size = size; - local->inode = ioc_inode; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", - frame, offset, size); - - weight = ioc_inode->weight; - - ioc_table_lock (ioc_inode->table); - { - list_move_tail (&ioc_inode->inode_lru, - &ioc_inode->table->inode_lru[weight]); - } - ioc_table_unlock (ioc_inode->table); - - ioc_dispatch_requests (frame, ioc_inode, fd, offset, size); + if (!fd_ctx_get(fd, this, NULL)) { + /* disable caching for this fd, go ahead with normal readv */ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); return 0; + } + + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + op_errno = ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&local->fill_list); + + frame->local = local; + local->pending_offset = offset; + local->pending_size = size; + local->offset = offset; + local->size = size; + local->inode = ioc_inode; + local->xattr_req = dict_ref(xdata); + + gf_msg_trace(this->name, 0, + "NEW REQ (%p) offset " + "= %" PRId64 " && size = %" GF_PRI_SIZET "", + frame, offset, size); + + weight = ioc_inode->weight; + + ioc_table_lock(ioc_inode->table); + { + list_move_tail(&ioc_inode->inode_lru, + &ioc_inode->table->inode_lru[weight]); + } + ioc_table_unlock(ioc_inode->table); + + ioc_dispatch_requests(frame, ioc_inode, fd, offset, size); + return 0; out: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; } /* @@ -1220,21 +1216,31 @@ out: * */ int32_t -ioc_writev_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +ioc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; - - local = frame->local; - inode_ctx_get (local->fd->inode, this, &ioc_inode); - - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); - - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; + + local = frame->local; + frame->local = NULL; + inode_ctx_get(local->fd->inode, this, &ioc_inode); + + if (op_ret >= 0) { + ioc_update_pages(frame, (ioc_inode_t *)(long)ioc_inode, local->vector, + local->op_ret, op_ret, local->offset); + } + + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + if (local->iobref) { + iobref_unref(local->iobref); + GF_FREE(local->vector); + } + + mem_put(local); + return 0; } /* @@ -1249,34 +1255,38 @@ ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +ioc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - ioc_local_t *local = NULL; - uint64_t ioc_inode = 0; + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; - local = GF_CALLOC (1, sizeof (ioc_local_t), gf_ioc_mt_ioc_local_t); - if (local == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); + local = mem_get0(this->local_pool); + if (local == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); - STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, NULL, NULL); - return 0; - } + STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, 
NULL, NULL); + return 0; + } - /* TODO: why is it not fd_ref'ed */ - local->fd = fd; - frame->local = local; + /* TODO: why is it not fd_ref'ed */ + local->fd = fd; + frame->local = local; - inode_ctx_get (fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + inode_ctx_get(fd->inode, this, &ioc_inode); + if (ioc_inode) { + local->iobref = iobref_ref(iobref); + local->vector = iov_dup(vector, count); + local->op_ret = count; + local->offset = offset; + } - STACK_WIND (frame, ioc_writev_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, - iobref); + STACK_WIND(frame, ioc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); - return 0; + return 0; } /* @@ -1291,17 +1301,15 @@ ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, * */ int32_t -ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +ioc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - /* * ioc_ftruncate_cbk - * @@ -1314,17 +1322,15 @@ ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +ioc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + STACK_UNWIND_STRICT(ftruncate, frame, 
op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - /* * ioc_truncate - * @@ -1335,18 +1341,19 @@ ioc_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * */ int32_t -ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +ioc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (loc->inode, this, &ioc_inode); + inode_ctx_get(loc->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - return 0; + STACK_WIND(frame, ioc_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } /* @@ -1359,531 +1366,719 @@ ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) * */ int32_t -ioc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +ioc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - uint64_t ioc_inode = 0; + uint64_t ioc_inode = 0; - inode_ctx_get (fd->inode, this, &ioc_inode); + inode_ctx_get(fd->inode, this, &ioc_inode); - if (ioc_inode) - ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - STACK_WIND (frame, ioc_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - return 0; + STACK_WIND(frame, ioc_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } int32_t -ioc_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct gf_flock *lock) +ioc_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - 
STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock); - return 0; + STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata); + return 0; } int32_t -ioc_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) +ioc_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) { - ioc_inode_t *ioc_inode = NULL; - uint64_t tmp_inode = 0; - - inode_ctx_get (fd->inode, this, &tmp_inode); - ioc_inode = (ioc_inode_t *)(long)tmp_inode; - if (!ioc_inode) { - gf_log (this->name, GF_LOG_DEBUG, - "inode context is NULL: returning EBADFD"); - STACK_UNWIND_STRICT (lk, frame, -1, EBADFD, NULL); - return 0; - } + ioc_inode_t *ioc_inode = NULL; + uint64_t tmp_inode = 0; + + inode_ctx_get(fd->inode, this, &tmp_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_inode; + if (!ioc_inode) { + gf_msg_debug(this->name, EBADFD, + "inode context is NULL: returning EBADFD"); + STACK_UNWIND_STRICT(lk, frame, -1, EBADFD, NULL, NULL); + return 0; + } - ioc_inode_lock (ioc_inode); - { - gettimeofday (&ioc_inode->cache.tv, NULL); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + ioc_inode->cache.last_revalidate = gf_time(); + } + ioc_inode_unlock(ioc_inode); - STACK_WIND (frame, ioc_lk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lk, fd, cmd, lock); + STACK_WIND(frame, ioc_lk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata); - return 0; + return 0; } -int32_t -ioc_get_priority_list (const char *opt_str, struct list_head *first) +int +ioc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) { - int32_t max_pri = 1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *stripe_str = NULL; - char *pattern = NULL; - char *priority = NULL; - char *string = NULL; - struct ioc_priority *curr = NULL, *tmp = NULL; - - string = gf_strdup (opt_str); 
- if (string == NULL) { - max_pri = -1; - goto out; - } - - /* Get the pattern for cache priority. - * "option priority *.jpg:1,abc*:2" etc - */ - /* TODO: inode_lru in table is statically hard-coded to 5, - * should be changed to run-time configuration - */ - stripe_str = strtok_r (string, ",", &tmp_str); - while (stripe_str) { - curr = GF_CALLOC (1, sizeof (struct ioc_priority), - gf_ioc_mt_ioc_priority); - if (curr == NULL) { - max_pri = -1; - goto out; - } + gf_dirent_t *entry = NULL; + char *path = NULL; + fd_t *fd = NULL; - list_add_tail (&curr->list, first); + fd = frame->local; + frame->local = NULL; - dup_str = gf_strdup (stripe_str); - if (dup_str == NULL) { - max_pri = -1; - goto out; - } + if (op_ret <= 0) + goto unwind; - pattern = strtok_r (dup_str, ":", &tmp_str1); - if (!pattern) { - max_pri = -1; - goto out; - } + list_for_each_entry(entry, &entries->list, list) + { + inode_path(fd->inode, entry->d_name, &path); + ioc_inode_update(this, entry->inode, path, &entry->d_stat); + GF_FREE(path); + path = NULL; + } - priority = strtok_r (NULL, ":", &tmp_str1); - if (!priority) { - max_pri = -1; - goto out; - } - - gf_log ("io-cache", GF_LOG_TRACE, - "ioc priority : pattern %s : priority %s", - pattern, - priority); - - curr->pattern = gf_strdup (pattern); - if (curr->pattern == NULL) { - max_pri = -1; - goto out; - } - - curr->priority = strtol (priority, &tmp_str2, 0); - if (tmp_str2 && (*tmp_str2)) { - max_pri = -1; - goto out; - } else { - max_pri = max (max_pri, curr->priority); - } - - GF_FREE (dup_str); - dup_str = NULL; +unwind: + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); - stripe_str = strtok_r (NULL, ",", &tmp_str); - } -out: - if (string != NULL) { - GF_FREE (string); - } + return 0; +} - if (dup_str != NULL) { - GF_FREE (dup_str); - } +int +ioc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) +{ + frame->local = fd; - if (max_pri == -1) { - list_for_each_entry_safe 
(curr, tmp, first, list) { - list_del_init (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); - } - } + STACK_WIND(frame, ioc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict); - return max_pri; + return 0; } -int32_t -mem_acct_init (xlator_t *this) +static int32_t +ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - int ret = -1; + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} - if (!this) - return ret; +static int32_t +ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; - ret = xlator_mem_acct_init (this, gf_ioc_mt_end + 1); + inode_ctx_get(fd->inode, this, &ioc_inode); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - return ret; + STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; } +static int32_t +ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} -gf_boolean_t -check_cache_size_ok (xlator_t *this, uint64_t cache_size) +static int32_t +ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - gf_boolean_t ret = _gf_true; - uint64_t total_mem = 0; - uint64_t max_cache_size = 0; - volume_option_t *opt = NULL; - - GF_ASSERT (this); - opt = xlator_volume_option_get (this, "cache-size"); - if (!opt) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, - "could not get cache-size option"); - goto out; - } + uint64_t ioc_inode = 0; - 
total_mem = get_mem_size (); - if (-1 == total_mem) - max_cache_size = opt->max; - else - max_cache_size = total_mem; + inode_ctx_get(fd->inode, this, &ioc_inode); - gf_log (this->name, GF_LOG_INFO, "Max cache size is %"PRIu64, - max_cache_size); + if (ioc_inode) + ioc_inode_flush((ioc_inode_t *)(long)ioc_inode); - if (cache_size > max_cache_size) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 - " is greater than the max size of %"PRIu64, - cache_size, max_cache_size); - goto out; - } -out: - return ret; + STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; } -int -reconfigure (xlator_t *this, dict_t *options) +int32_t +ioc_get_priority_list(const char *opt_str, struct list_head *first) { - data_t *data = NULL; - ioc_table_t *table = NULL; - int ret = -1; - uint64_t cache_size_new = 0; - if (!this || !this->private) - goto out; + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; + + string = gf_strdup(opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. 
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + stripe_str = strtok_r(string, ",", &tmp_str); + while (stripe_str) { + curr = GF_CALLOC(1, sizeof(struct ioc_priority), + gf_ioc_mt_ioc_priority); + if (curr == NULL) { + max_pri = -1; + goto out; + } + + list_add_tail(&curr->list, first); + + dup_str = gf_strdup(stripe_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; + } + + pattern = strtok_r(dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } + + priority = strtok_r(NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; + } + + gf_msg_trace("io-cache", 0, "ioc priority : pattern %s : priority %s", + pattern, priority); + + curr->pattern = gf_strdup(pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; + } + + curr->priority = strtol(priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max(max_pri, curr->priority); + } + + GF_FREE(dup_str); + dup_str = NULL; + + stripe_str = strtok_r(NULL, ",", &tmp_str); + } +out: + GF_FREE(string); - table = this->private; + GF_FREE(dup_str); - ioc_table_lock (table); + if (max_pri == -1) { + list_for_each_entry_safe(curr, tmp, first, list) { - GF_OPTION_RECONF ("cache-timeout", table->cache_timeout, - options, int32, unlock); - - data = dict_get (options, "priority"); - if (data) { - char *option_list = data_to_str (data); + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); + } + } - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - table->max_pri = ioc_get_priority_list (option_list, - &table->priority_list); + return max_pri; +} - if (table->max_pri == -1) { - goto unlock; - } - table->max_pri ++; - } +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; - GF_OPTION_RECONF ("max-file-size", 
table->max_file_size, - options, size, unlock); + if (!this) + return ret; - GF_OPTION_RECONF ("min-file-size", table->min_file_size, - options, size, unlock); + ret = xlator_mem_acct_init(this, gf_ioc_mt_end + 1); - if ((table->max_file_size >= 0) && - (table->min_file_size > table->max_file_size)) { - gf_log (this->name, GF_LOG_ERROR, "minimum size (%" - PRIu64") of a file that can be cached is " - "greater than maximum size (%"PRIu64"). " - "Hence Defaulting to old value", - table->min_file_size, table->max_file_size); - goto unlock; - } + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_MEMORY_INIT_FAILED, NULL); + return ret; + } - GF_OPTION_RECONF ("cache-size", cache_size_new, - options, size, unlock); - if (!check_cache_size_ok (this, cache_size_new)) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "Not reconfiguring cache-size"); - goto unlock; - } - table->cache_size = cache_size_new; + return ret; +} - ret = 0; - } -unlock: - ioc_table_unlock (table); +static gf_boolean_t +check_cache_size_ok(xlator_t *this, uint64_t cache_size) +{ + gf_boolean_t ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT(this); + opt = xlator_volume_option_get(this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_smsg(this->name, GF_LOG_ERROR, EINVAL, + IO_CACHE_MSG_NO_CACHE_SIZE_OPT, NULL); + goto out; + } + + total_mem = get_mem_size(); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, max_cache_size); + + if (cache_size > max_cache_size) { + ret = _gf_false; + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT, + "Cache-size=%" PRIu64, cache_size, "max-size=%" PRIu64, + max_cache_size, NULL); + goto out; + } out: - return ret; + return ret; } - -/* - * init - - * @this: - * - */ -int32_t -init (xlator_t *this) +int +reconfigure(xlator_t *this, dict_t *options) 
{ - ioc_table_t *table = NULL; - dict_t *xl_options = NULL; - uint32_t index = 0; - int32_t ret = -1; - glusterfs_ctx_t *ctx = NULL; - data_t *data = 0; - - xl_options = this->options; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: io-cache not configured with exactly " - "one child"); - goto out; - } + data_t *data = NULL; + ioc_table_t *table = NULL; + int ret = -1; + uint64_t cache_size_new = 0; + if (!this || !this->private) + goto out; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - table = (void *) GF_CALLOC (1, sizeof (*table), gf_ioc_mt_ioc_table_t); - if (table == NULL) { - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - goto out; - } + table = this->private; - table->xl = this; - table->page_size = this->ctx->page_size; + ioc_table_lock(table); + { + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, + unlock); - GF_OPTION_INIT ("cache-size", table->cache_size, size, out); + GF_OPTION_RECONF("cache-timeout", table->cache_timeout, options, int32, + unlock); - GF_OPTION_INIT ("cache-timeout", table->cache_timeout, int32, out); - - GF_OPTION_INIT ("min-file-size", table->min_file_size, size, out); + data = dict_get(options, "priority"); + if (data) { + char *option_list = data_to_str(data); - GF_OPTION_INIT ("max-file-size", table->max_file_size, size, out); + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list(option_list, + &table->priority_list); - if (!check_cache_size_ok (this, table->cache_size)) { - ret = -1; - goto out; + if (table->max_pri == -1) { + goto unlock; + } + table->max_pri++; } - INIT_LIST_HEAD (&table->priority_list); - table->max_pri = 1; - data = dict_get (xl_options, "priority"); - if (data) { - char *option_list = data_to_str (data); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse 
the list of pattern:priority */ - table->max_pri = ioc_get_priority_list (option_list, - &table->priority_list); - - if (table->max_pri == -1) { - goto out; - } - } - table->max_pri ++; + GF_OPTION_RECONF("max-file-size", table->max_file_size, options, + size_uint64, unlock); - INIT_LIST_HEAD (&table->inodes); + GF_OPTION_RECONF("min-file-size", table->min_file_size, options, + size_uint64, unlock); - if ((table->max_file_size >= 0) - && (table->min_file_size > table->max_file_size)) { - gf_log ("io-cache", GF_LOG_ERROR, "minimum size (%" - PRIu64") of a file that can be cached is " - "greater than maximum size (%"PRIu64")", - table->min_file_size, table->max_file_size); - goto out; + if ((table->max_file_size <= UINT64_MAX) && + (table->min_file_size > table->max_file_size)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_DEFAULTING_TO_OLD, + "minimum-size=%" PRIu64, table->min_file_size, + "maximum-size=%" PRIu64, table->max_file_size, NULL); + goto unlock; } - table->inode_lru = GF_CALLOC (table->max_pri, - sizeof (struct list_head), - gf_ioc_mt_list_head); - if (table->inode_lru == NULL) { - goto out; + GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64, + unlock); + if (!check_cache_size_ok(this, cache_size_new)) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE, NULL); + goto unlock; } + table->cache_size = cache_size_new; - for (index = 0; index < (table->max_pri); index++) - INIT_LIST_HEAD (&table->inode_lru[index]); - - pthread_mutex_init (&table->table_lock, NULL); - this->private = table; ret = 0; + } +unlock: + ioc_table_unlock(table); +out: + return ret; +} - ctx = this->ctx; - ioc_log2_page_size = log_base2 (ctx->page_size); +/* + * init - + * @this: + * + */ +int32_t +init(xlator_t *this) +{ + ioc_table_t *table = NULL; + dict_t *xl_options = NULL; + uint32_t index = 0; + int32_t ret = -1; + glusterfs_ctx_t *ctx = NULL; + data_t *data = 0; + uint32_t num_pages = 0; + + xl_options = 
this->options; + + if (!this->children || this->children->next) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, NULL); + goto out; + } + + if (!this->parents) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_CACHE_MSG_VOL_MISCONFIGURED, + NULL); + } + + table = (void *)GF_CALLOC(1, sizeof(*table), gf_ioc_mt_ioc_table_t); + if (table == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL); + goto out; + } + + table->xl = this; + table->page_size = this->ctx->page_size; + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + GF_OPTION_INIT("cache-size", table->cache_size, size_uint64, out); + + GF_OPTION_INIT("cache-timeout", table->cache_timeout, int32, out); + + GF_OPTION_INIT("min-file-size", table->min_file_size, size_uint64, out); + + GF_OPTION_INIT("max-file-size", table->max_file_size, size_uint64, out); + + if (!check_cache_size_ok(this, table->cache_size)) { + ret = -1; + goto out; + } + + INIT_LIST_HEAD(&table->priority_list); + table->max_pri = 1; + data = dict_get(xl_options, "priority"); + if (data) { + char *option_list = data_to_str(data); + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list(option_list, + &table->priority_list); + + if (table->max_pri == -1) { + goto out; + } + } + table->max_pri++; + + INIT_LIST_HEAD(&table->inodes); + + if ((table->max_file_size <= UINT64_MAX) && + (table->min_file_size > table->max_file_size)) { + gf_smsg("io-cache", GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT, + "minimum-size=%" PRIu64, table->min_file_size, + "maximum-size=%" PRIu64, table->max_file_size, NULL); + goto out; + } + + table->inode_lru = GF_CALLOC(table->max_pri, sizeof(struct list_head), + gf_ioc_mt_list_head); + if (table->inode_lru == NULL) { + goto out; + } + + for (index = 0; index < (table->max_pri); index++) + INIT_LIST_HEAD(&table->inode_lru[index]); + + this->local_pool = 
mem_pool_new(ioc_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_CREATE_MEM_POOL_FAILED, NULL); + goto out; + } + + pthread_mutex_init(&table->table_lock, NULL); + this->private = table; + + num_pages = (table->cache_size / table->page_size) + + ((table->cache_size % table->page_size) ? 1 : 0); + + table->mem_pool = mem_pool_new(rbthash_entry_t, num_pages); + if (!table->mem_pool) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, + IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, NULL); + goto out; + } + + ret = 0; + + ctx = this->ctx; + ioc_log2_page_size = log_base2(ctx->page_size); out: - if (ret == -1) { - if (table != NULL) { - GF_FREE (table->inode_lru); - GF_FREE (table); - } + if (ret == -1) { + if (table != NULL) { + GF_FREE(table->inode_lru); + GF_FREE(table); } + } - return ret; + return ret; } void -ioc_page_waitq_dump (ioc_page_t *page, char *prefix) +ioc_page_waitq_dump(ioc_page_t *page, char *prefix) { - ioc_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - int32_t i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - - trav = page->waitq; - - while (trav) { - frame = trav->data; - sprintf (key, "waitq.frame[%d]", i++); - gf_proc_dump_write (key, "%"PRId64, frame->root->unique); - - trav = trav->next; - } + ioc_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + trav = page->waitq; + + while (trav) { + frame = trav->data; + sprintf(key, "waitq.frame[%d]", i++); + gf_proc_dump_write(key, "%" PRId64, frame->root->unique); + + trav = trav->next; + } } void -__ioc_inode_waitq_dump (ioc_inode_t *ioc_inode, char *prefix) +__ioc_inode_waitq_dump(ioc_inode_t *ioc_inode, char *prefix) { - ioc_waitq_t *trav = NULL; - ioc_page_t *page = NULL; - int32_t i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; + ioc_waitq_t *trav = NULL; + ioc_page_t *page = NULL; + int32_t i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; - trav = ioc_inode->waitq; + trav 
= ioc_inode->waitq; - while (trav) { - page = trav->data; + while (trav) { + page = trav->data; - sprintf (key, "cache-validation-waitq.page[%d].offset", i++); - gf_proc_dump_write (key, "%"PRId64, page->offset); + sprintf(key, "cache-validation-waitq.page[%d].offset", i++); + gf_proc_dump_write(key, "%" PRId64, page->offset); - trav = trav->next; - } + trav = trav->next; + } } void -__ioc_page_dump (ioc_page_t *page, char *prefix) +__ioc_page_dump(ioc_page_t *page, char *prefix) { + int ret = -1; - ioc_page_lock (page); - { - gf_proc_dump_write ("offset", "%"PRId64, page->offset); - gf_proc_dump_write ("size", "%"PRId64, page->size); - gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); - gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); - ioc_page_waitq_dump (page, prefix); - } - ioc_page_unlock (page); + if (!page) + return; + /* ioc_page_lock can be used to hold the mutex. But in statedump + * its better to use trylock to avoid deadlocks. + */ + ret = pthread_mutex_trylock(&page->page_lock); + if (ret) + goto out; + { + gf_proc_dump_write("offset", "%" PRId64, page->offset); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size); + gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write("ready", "%s", page->ready ? 
"yes" : "no"); + ioc_page_waitq_dump(page, prefix); + } + pthread_mutex_unlock(&page->page_lock); + +out: + if (ret && page) + gf_proc_dump_write("Unable to dump the page information", + "(Lock acquisition failed) %p", page); + + return; } void -__ioc_cache_dump (ioc_inode_t *ioc_inode, char *prefix) +__ioc_cache_dump(ioc_inode_t *ioc_inode, char *prefix) { - off_t offset = 0; - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - int i = 0; - struct tm *tm = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char timestr[256] = {0, }; - - if ((ioc_inode == NULL) || (prefix == NULL)) { - goto out; - } - - table = ioc_inode->table; - - tm = localtime (&ioc_inode->cache.tv.tv_sec); - strftime (timestr, 256, "%Y-%m-%d %H:%M:%S", tm); - snprintf (timestr + strlen (timestr), 256 - strlen (timestr), - ".%"GF_PRI_SUSECONDS, ioc_inode->cache.tv.tv_usec); - - gf_proc_dump_write ("last-cache-validation-time", "%s", timestr); - - for (offset = 0; offset < ioc_inode->ia_size; - offset += table->page_size) { - page = __ioc_page_get (ioc_inode, offset); - if (page == NULL) { - continue; - } - - sprintf (key, "inode.cache.page[%d]", i++); - __ioc_page_dump (page, key); - } + off_t offset = 0; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int i = 0; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char timestr[GF_TIMESTR_SIZE] = { + 0, + }; + + if ((ioc_inode == NULL) || (prefix == NULL)) { + goto out; + } + + table = ioc_inode->table; + + if (ioc_inode->cache.last_revalidate) { + gf_time_fmt(timestr, sizeof timestr, ioc_inode->cache.last_revalidate, + gf_timefmt_FT); + + gf_proc_dump_write("last-cache-validation-time", "%s", timestr); + } + + for (offset = 0; offset < ioc_inode->ia_size; offset += table->page_size) { + page = __ioc_page_get(ioc_inode, offset); + if (page == NULL) { + continue; + } + + sprintf(key, "inode.cache.page[%d]", i++); + __ioc_page_dump(page, key); + } out: - return; + return; } - -void -ioc_inode_dump (ioc_inode_t *ioc_inode, char *prefix) 
+int +ioc_inode_dump(xlator_t *this, inode_t *inode) { + char *path = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + gf_boolean_t section_added = _gf_false; + char uuid_str[64] = { + 0, + }; + + if (this == NULL || inode == NULL) + goto out; + + gf_proc_dump_build_key(key_prefix, "io-cache", "inode"); + + inode_ctx_get(inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (ioc_inode == NULL) + goto out; + + /* Similar to ioc_page_dump function its better to use + * pthread_mutex_trylock and not to use gf_log in statedump + * to avoid deadlocks. + */ + ret = pthread_mutex_trylock(&ioc_inode->inode_lock); + if (ret) + goto out; + + { + if (gf_uuid_is_null(ioc_inode->inode->gfid)) + goto unlock; + + gf_proc_dump_add_section("%s", key_prefix); + section_added = _gf_true; + + __inode_path(ioc_inode->inode, NULL, &path); + + gf_proc_dump_write("inode.weight", "%d", ioc_inode->weight); + + if (path) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } + + gf_proc_dump_write("uuid", "%s", + uuid_utoa_r(ioc_inode->inode->gfid, uuid_str)); + __ioc_cache_dump(ioc_inode, key_prefix); + __ioc_inode_waitq_dump(ioc_inode, key_prefix); + } +unlock: + pthread_mutex_unlock(&ioc_inode->inode_lock); - if ((ioc_inode == NULL) || (prefix == NULL)) { - goto out; - } - - ioc_inode_lock (ioc_inode); - { - gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight); - __ioc_cache_dump (ioc_inode, prefix); - __ioc_inode_waitq_dump (ioc_inode, prefix); - } - ioc_inode_unlock (ioc_inode); out: - return; + if (ret && ioc_inode) { + if (section_added == _gf_false) + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("Unable to print the status of ioc_inode", + "(Lock acquisition failed) %s", + uuid_utoa(inode->gfid)); + } + return ret; } int -ioc_priv_dump (xlator_t *this) +ioc_priv_dump(xlator_t *this) { - ioc_table_t *priv = NULL; - ioc_inode_t 
*ioc_inode = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - - if (!this || !this->private) - goto out; - - priv = this->private; - gf_proc_dump_build_key (key_prefix, "xlator.performance.io-cache", - "priv"); - gf_proc_dump_add_section (key_prefix); - - ioc_table_lock (priv); - { - gf_proc_dump_write ("page_size", "%ld", priv->page_size); - gf_proc_dump_write ("cache_size", "%ld", priv->cache_size); - gf_proc_dump_write ("cache_used", "%ld", priv->cache_used); - gf_proc_dump_write ("inode_count", "%u", priv->inode_count); - gf_proc_dump_write ("cache_timeout", "%u", priv->cache_timeout); - gf_proc_dump_write ("min-file-size", "%u", priv->min_file_size); - gf_proc_dump_write ("max-file-size", "%u", priv->max_file_size); - - list_for_each_entry (ioc_inode, &priv->inodes, inode_list) { - ioc_inode_dump (ioc_inode, key_prefix); - } - } - ioc_table_unlock (priv); + ioc_table_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + int ret = -1; + gf_boolean_t add_section = _gf_false; + + if (!this || !this->private) + goto out; + + priv = this->private; + + gf_proc_dump_build_key(key_prefix, "io-cache", "priv"); + gf_proc_dump_add_section("%s", key_prefix); + add_section = _gf_true; + + ret = pthread_mutex_trylock(&priv->table_lock); + if (ret) + goto out; + { + gf_proc_dump_write("page_size", "%" PRIu64, priv->page_size); + gf_proc_dump_write("cache_size", "%" PRIu64, priv->cache_size); + gf_proc_dump_write("cache_used", "%" PRIu64, priv->cache_used); + gf_proc_dump_write("inode_count", "%u", priv->inode_count); + gf_proc_dump_write("cache_timeout", "%u", priv->cache_timeout); + gf_proc_dump_write("min-file-size", "%" PRIu64, priv->min_file_size); + gf_proc_dump_write("max-file-size", "%" PRIu64, priv->max_file_size); + } + pthread_mutex_unlock(&priv->table_lock); out: - return 0; + if (ret && priv) { + if (!add_section) { + gf_proc_dump_build_key(key_prefix, + "xlator." 
+ "performance.io-cache", + "priv"); + gf_proc_dump_add_section("%s", key_prefix); + } + gf_proc_dump_write( + "Unable to dump the state of private " + "structure of io-cache xlator", + "(Lock " + "acquisition failed) %s", + this->name); + } + + return 0; } /* @@ -1893,86 +2088,144 @@ out: * */ void -fini (xlator_t *this) +fini(xlator_t *this) { - ioc_table_t *table = NULL; - - table = this->private; + ioc_table_t *table = NULL; + struct ioc_priority *curr = NULL, *tmp = NULL; - if (table == NULL) - return; + table = this->private; - if (table->mem_pool != NULL) { - mem_pool_destroy (table->mem_pool); - table->mem_pool = NULL; - } - - pthread_mutex_destroy (&table->table_lock); - GF_FREE (table); - - this->private = NULL; + if (table == NULL) return; + + this->private = NULL; + + if (table->mem_pool != NULL) { + mem_pool_destroy(table->mem_pool); + table->mem_pool = NULL; + } + + list_for_each_entry_safe(curr, tmp, &table->priority_list, list) + { + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); + } + + /* inode_lru and inodes list can be empty in case fini() is + * called soon after init()? Hence commenting the below asserts. 
+ */ + /*for (i = 0; i < table->max_pri; i++) { + GF_ASSERT (list_empty (&table->inode_lru[i])); + } + + GF_ASSERT (list_empty (&table->inodes)); + */ + pthread_mutex_destroy(&table->table_lock); + GF_FREE(table); + + this->private = NULL; + return; } struct xlator_fops fops = { - .open = ioc_open, - .create = ioc_create, - .readv = ioc_readv, - .writev = ioc_writev, - .truncate = ioc_truncate, - .ftruncate = ioc_ftruncate, - .lookup = ioc_lookup, - .lk = ioc_lk, - .setattr = ioc_setattr, - .mknod = ioc_mknod + .open = ioc_open, + .create = ioc_create, + .readv = ioc_readv, + .writev = ioc_writev, + .truncate = ioc_truncate, + .ftruncate = ioc_ftruncate, + .lookup = ioc_lookup, + .lk = ioc_lk, + .setattr = ioc_setattr, + .mknod = ioc_mknod, + + .readdirp = ioc_readdirp, + .discard = ioc_discard, + .zerofill = ioc_zerofill, }; - struct xlator_dumpops dumpops = { - .priv = ioc_priv_dump, + .priv = ioc_priv_dump, + .inodectx = ioc_inode_dump, }; struct xlator_cbks cbks = { - .forget = ioc_forget, - .release = ioc_release + .forget = ioc_forget, + .release = ioc_release, + .invalidate = ioc_invalidate, }; struct volume_options options[] = { - { .key = {"priority"}, - .type = GF_OPTION_TYPE_ANY, - .default_value = "", - .description = "Assigns priority to filenames with specific " - "patterns so that when a page needs to be ejected " - "out of the cache, the page of a file whose " - "priority is the lowest will be ejected earlier" - }, - { .key = {"cache-timeout", "force-revalidate-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 60, - .default_value = "1", - .description = "The cached data for a file will be retained till " - "'cache-refresh-timeout' seconds, after which data " - "re-validation is performed." - }, - { .key = {"cache-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 4 * GF_UNIT_MB, - .max = 32 * GF_UNIT_GB, - .default_value = "32MB", - .description = "Size of the read cache." 
- }, - { .key = {"min-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .default_value = "0", - .description = "Minimum file size which would be cached by the " - "io-cache translator." - }, - { .key = {"max-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .default_value = "0", - .description = "Maximum file size which would be cached by the " - "io-cache translator." - }, - { .key = {NULL} }, + { + .key = {"io-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable io-cache", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"priority"}, + .type = GF_OPTION_TYPE_PRIORITY_LIST, + .default_value = "", + .description = "Assigns priority to filenames with specific " + "patterns so that when a page needs to be ejected " + "out of the cache, the page of a file whose " + "priority is the lowest will be ejected earlier", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"cache-timeout", "force-revalidate-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60, + .default_value = "1", + .description = "The cached data for a file will be retained for " + "'cache-refresh-timeout' seconds, after which data " + "re-validation is performed.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4 * GF_UNIT_MB, + .max = INFINITY, + .default_value = "32MB", + .description = "Size of the read cache.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = "Minimum file size which would be cached by the " + "io-cache translator.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "0", + .description = 
"Maximum file size which would be cached by the " + "io-cache translator.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-cache"}, + .description = "Enable/Disable io cache translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "io-cache", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h index eec24f143ba..14923c75edc 100644 --- a/xlators/performance/io-cache/src/io-cache.h +++ b/xlators/performance/io-cache/src/io-cache.h @@ -1,46 +1,29 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __IO_CACHE_H #define __IO_CACHE_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include <sys/types.h> -#include "compat-errno.h" - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" -#include "call-stub.h" -#include "rbthash.h" -#include "hashfn.h" +#include <glusterfs/compat-errno.h> + +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/rbthash.h> #include <sys/time.h> #include <fnmatch.h> +#include "io-cache-messages.h" -#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ -#define IOC_CACHE_SIZE (32 * 1024 * 1024) +#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ +#define IOC_CACHE_SIZE (32 * 1024 * 1024) #define IOC_PAGE_TABLE_BUCKET_COUNT 1 struct ioc_table; @@ -49,9 +32,9 @@ struct ioc_page; struct ioc_inode; struct ioc_priority { - struct list_head list; - char *pattern; - uint32_t priority; + struct list_head list; + char *pattern; + uint32_t priority; }; /* @@ -62,10 +45,10 @@ struct ioc_priority { * @data: pointer to the frame which is waiting */ struct ioc_waitq { - struct ioc_waitq *next; - void *data; - off_t pending_offset; - size_t pending_size; + struct ioc_waitq *next; + void *data; + off_t pending_offset; + size_t pending_size; }; /* @@ -73,40 +56,41 @@ struct ioc_waitq { * */ struct ioc_fill { - struct list_head list; /* list of ioc_fill structures of a frame */ - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct list_head list; /* list of ioc_fill structures of a frame */ + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; struct 
ioc_local { - mode_t mode; - int32_t flags; - int32_t wbflags; - loc_t file_loc; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - struct list_head fill_list; /* list of ioc_fill structures */ - off_t pending_offset; /* - * offset from this frame should - * continue - */ - size_t pending_size; /* - * size of data this frame is waiting - * on - */ - struct ioc_inode *inode; - int32_t wait_count; - pthread_mutex_t local_lock; - struct ioc_waitq *waitq; - void *stub; - fd_t *fd; - int32_t need_xattr; - dict_t *xattr_req; + mode_t mode; + int32_t flags; + loc_t file_loc; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + struct list_head fill_list; /* list of ioc_fill structures */ + off_t pending_offset; /* + * offset from this frame should + * continue + */ + size_t pending_size; /* + * size of data this frame is waiting + * on + */ + struct ioc_inode *inode; + int32_t wait_count; + pthread_mutex_t local_lock; + struct ioc_waitq *waitq; + void *stub; + fd_t *fd; + struct iovec *vector; + struct iobref *iobref; + int32_t need_xattr; + dict_t *xattr_req; }; /* @@ -114,69 +98,69 @@ struct ioc_local { * */ struct ioc_page { - struct list_head page_lru; - struct ioc_inode *inode; /* inode this page belongs to */ - struct ioc_priority *priority; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ioc_waitq *waitq; - struct iobref *iobref; - pthread_mutex_t page_lock; + struct list_head page_lru; + struct ioc_inode *inode; /* inode this page belongs to */ + struct ioc_priority *priority; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ioc_waitq *waitq; + struct iobref *iobref; + pthread_mutex_t page_lock; + int32_t op_errno; + char stale; }; struct ioc_cache { - rbthash_table_t *page_table; - struct list_head page_lru; - time_t mtime; /* - * seconds component of file mtime - */ - time_t mtime_nsec; /* - * nanosecond 
component of file mtime - */ - struct timeval tv; /* - * time-stamp at last re-validate - */ + rbthash_table_t *page_table; + struct list_head page_lru; + time_t mtime; /* + * seconds component of file mtime + */ + time_t mtime_nsec; /* + * nanosecond component of file mtime + */ + time_t last_revalidate; /* timestamp at last re-validate */ }; struct ioc_inode { - struct ioc_table *table; - off_t ia_size; - struct ioc_cache cache; - struct list_head inode_list; /* - * list of inodes, maintained by - * io-cache translator - */ - struct list_head inode_lru; - struct ioc_waitq *waitq; - pthread_mutex_t inode_lock; - uint32_t weight; /* - * weight of the inode, increases - * on each read - */ - inode_t *inode; + struct ioc_table *table; + off_t ia_size; + struct ioc_cache cache; + struct list_head inode_list; /* + * list of inodes, maintained by + * io-cache translator + */ + struct list_head inode_lru; + struct ioc_waitq *waitq; + pthread_mutex_t inode_lock; + uint32_t weight; /* + * weight of the inode, increases + * on each read + */ + inode_t *inode; }; struct ioc_table { - uint64_t page_size; - uint64_t cache_size; - uint64_t cache_used; - uint64_t min_file_size; - uint64_t max_file_size; - struct list_head inodes; /* list of inodes cached */ - struct list_head active; - struct list_head *inode_lru; - struct list_head priority_list; - int32_t readv_count; - pthread_mutex_t table_lock; - xlator_t *xl; - uint32_t inode_count; - int32_t cache_timeout; - int32_t max_pri; - struct mem_pool *mem_pool; + uint64_t page_size; + uint64_t cache_size; + uint64_t cache_used; + uint64_t min_file_size; + uint64_t max_file_size; + struct list_head inodes; /* list of inodes cached */ + struct list_head active; + struct list_head *inode_lru; + struct list_head priority_list; + int32_t readv_count; + pthread_mutex_t table_lock; + xlator_t *xl; + uint32_t inode_count; + int32_t cache_timeout; + int32_t max_pri; + struct mem_pool *mem_pool; }; typedef struct ioc_table ioc_table_t; @@ 
-187,156 +171,136 @@ typedef struct ioc_waitq ioc_waitq_t; typedef struct ioc_fill ioc_fill_t; void * -str_to_ptr (char *string); +str_to_ptr(char *string); char * -ptr_to_str (void *ptr); +ptr_to_str(void *ptr); int32_t -ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, - struct iobref *iobref); +ioc_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata); ioc_page_t * -__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset); ioc_page_t * -__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset); +__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset); void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset); +ioc_page_fault(ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset); void -__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size); +__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size); ioc_waitq_t * -__ioc_page_wakeup (ioc_page_t *page); +__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno); void -ioc_page_flush (ioc_page_t *page); +ioc_page_flush(ioc_page_t *page); ioc_waitq_t * -__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno); +__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno); void -ioc_frame_return (call_frame_t *frame); +ioc_frame_return(call_frame_t *frame); void -ioc_waitq_return (ioc_waitq_t *waitq); +ioc_waitq_return(ioc_waitq_t *waitq); int32_t -ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size); - -#define ioc_inode_lock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "locked inode(%p)", ioc_inode); \ - 
pthread_mutex_lock (&ioc_inode->inode_lock); \ - } while (0) - - -#define ioc_inode_unlock(ioc_inode) \ - do { \ - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked inode(%p)", ioc_inode); \ - pthread_mutex_unlock (&ioc_inode->inode_lock); \ - } while (0) - - -#define ioc_table_lock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "locked table(%p)", table); \ - pthread_mutex_lock (&table->table_lock); \ - } while (0) - - -#define ioc_table_unlock(table) \ - do { \ - gf_log (table->xl->name, GF_LOG_TRACE, \ - "unlocked table(%p)", table); \ - pthread_mutex_unlock (&table->table_lock); \ - } while (0) - - -#define ioc_local_lock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "locked local(%p)", local); \ - pthread_mutex_lock (&local->local_lock); \ - } while (0) - - -#define ioc_local_unlock(local) \ - do { \ - gf_log (local->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked local(%p)", local); \ - pthread_mutex_unlock (&local->local_lock); \ - } while (0) - - -#define ioc_page_lock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "locked page(%p)", page); \ - pthread_mutex_lock (&page->page_lock); \ - } while (0) - - -#define ioc_page_unlock(page) \ - do { \ - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, \ - "unlocked page(%p)", page); \ - pthread_mutex_unlock (&page->page_lock); \ - } while (0) - - -static inline uint64_t -time_elapsed (struct timeval *now, - struct timeval *then) -{ - uint64_t sec = now->tv_sec - then->tv_sec; - - if (sec) - return sec; - - return 0; -} +ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, size_t size, + int32_t op_errno); + +#define ioc_inode_lock(ioc_inode) \ + do { \ + gf_msg_trace(ioc_inode->table->xl->name, 0, "locked inode(%p)", \ + ioc_inode); \ + pthread_mutex_lock(&ioc_inode->inode_lock); \ + } while (0) + +#define ioc_inode_unlock(ioc_inode) \ + do { \ + gf_msg_trace(ioc_inode->table->xl->name, 0, "unlocked 
inode(%p)", \ + ioc_inode); \ + pthread_mutex_unlock(&ioc_inode->inode_lock); \ + } while (0) + +#define ioc_table_lock(table) \ + do { \ + gf_msg_trace(table->xl->name, 0, "locked table(%p)", table); \ + pthread_mutex_lock(&table->table_lock); \ + } while (0) + +#define ioc_table_unlock(table) \ + do { \ + gf_msg_trace(table->xl->name, 0, "unlocked table(%p)", table); \ + pthread_mutex_unlock(&table->table_lock); \ + } while (0) + +#define ioc_local_lock(local) \ + do { \ + gf_msg_trace(local->inode->table->xl->name, 0, "locked local(%p)", \ + local); \ + pthread_mutex_lock(&local->local_lock); \ + } while (0) + +#define ioc_local_unlock(local) \ + do { \ + gf_msg_trace(local->inode->table->xl->name, 0, "unlocked local(%p)", \ + local); \ + pthread_mutex_unlock(&local->local_lock); \ + } while (0) + +#define ioc_page_lock(page) \ + do { \ + gf_msg_trace(page->inode->table->xl->name, 0, "locked page(%p)", \ + page); \ + pthread_mutex_lock(&page->page_lock); \ + } while (0) + +#define ioc_page_unlock(page) \ + do { \ + gf_msg_trace(page->inode->table->xl->name, 0, "unlocked page(%p)", \ + page); \ + pthread_mutex_unlock(&page->page_lock); \ + } while (0) ioc_inode_t * -ioc_inode_search (ioc_table_t *table, inode_t *inode); +ioc_inode_search(ioc_table_t *table, inode_t *inode); void -ioc_inode_destroy (ioc_inode_t *ioc_inode); +ioc_inode_destroy(ioc_inode_t *ioc_inode); + +int32_t +ioc_inode_update(xlator_t *this, inode_t *inode, char *path, + struct iatt *iabuf); ioc_inode_t * -ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight); +ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight); int64_t -__ioc_page_destroy (ioc_page_t *page); +__ioc_page_destroy(ioc_page_t *page); int64_t -__ioc_inode_flush (ioc_inode_t *ioc_inode); +__ioc_inode_flush(ioc_inode_t *ioc_inode); void -ioc_inode_flush (ioc_inode_t *ioc_inode); +ioc_inode_flush(ioc_inode_t *ioc_inode); void -ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct 
iatt *stbuf); +ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iatt *stbuf); int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf); +ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf); int32_t -ioc_prune (ioc_table_t *table); +ioc_prune(ioc_table_t *table); int32_t -ioc_need_prune (ioc_table_t *table); +ioc_need_prune(ioc_table_t *table); -inline uint32_t -ioc_hashfn (void *data, int len); #endif /* __IO_CACHE_H */ diff --git a/xlators/performance/io-cache/src/ioc-inode.c b/xlators/performance/io-cache/src/ioc-inode.c index 4c48c41d1b7..97767d85285 100644 --- a/xlators/performance/io-cache/src/ioc-inode.c +++ b/xlators/performance/io-cache/src/ioc-inode.c @@ -1,27 +1,13 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "io-cache.h" #include "ioc-mem-types.h" @@ -33,144 +19,140 @@ extern int ioc_log2_page_size; * */ void * -str_to_ptr (char *string) +str_to_ptr(char *string) { - void *ptr = NULL; + void *ptr = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", string, out); + GF_VALIDATE_OR_GOTO("io-cache", string, out); - ptr = (void *)strtoul (string, NULL, 16); + ptr = (void *)strtoul(string, NULL, 16); out: - return ptr; + return ptr; } - /* * ptr_to_str - convert a pointer to string * @ptr: pointer * */ char * -ptr_to_str (void *ptr) +ptr_to_str(void *ptr) { - int ret = 0; - char *str = NULL; + int ret = 0; + char *str = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ptr, out); + GF_VALIDATE_OR_GOTO("io-cache", ptr, out); - ret = gf_asprintf (&str, "%p", ptr); - if (-1 == ret) { - gf_log ("io-cache", GF_LOG_WARNING, - "asprintf failed while converting ptr to str"); - str = NULL; - goto out; - } + ret = gf_asprintf(&str, "%p", ptr); + if (-1 == ret) { + gf_smsg("io-cache", GF_LOG_WARNING, 0, + IO_CACHE_MSG_STR_COVERSION_FAILED, NULL); + str = NULL; + goto out; + } out: - return str; + return str; } - void -ioc_inode_wakeup (call_frame_t *frame, ioc_inode_t *ioc_inode, - struct iatt *stbuf) +ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode, + struct iatt *stbuf) { - ioc_waitq_t *waiter = NULL, *waited = NULL; - ioc_waitq_t *page_waitq = NULL; - int8_t cache_still_valid = 1; - ioc_local_t *local = NULL; - int8_t need_fault = 0; - ioc_page_t *waiter_page = NULL; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, out); - - if (ioc_inode == NULL) { - local->op_ret = -1; - local->op_errno = EINVAL; - gf_log (frame->this->name, GF_LOG_WARNING, "ioc_inode is NULL"); - goto out; - } - - ioc_inode_lock (ioc_inode); - { - waiter = ioc_inode->waitq; - ioc_inode->waitq = NULL; - } - ioc_inode_unlock (ioc_inode); - - if (stbuf) - 
cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf); - else - cache_still_valid = 0; - + ioc_waitq_t *waiter = NULL, *waited = NULL; + ioc_waitq_t *page_waitq = NULL; + int8_t cache_still_valid = 1; + ioc_local_t *local = NULL; + int8_t need_fault = 0; + ioc_page_t *waiter_page = NULL; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (ioc_inode == NULL) { + local->op_ret = -1; + local->op_errno = EINVAL; + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, IO_CACHE_MSG_INODE_NULL, + NULL); + goto out; + } + + if (stbuf) + cache_still_valid = ioc_cache_still_valid(ioc_inode, stbuf); + else + cache_still_valid = 0; + + ioc_inode_lock(ioc_inode); + { + waiter = ioc_inode->waitq; if (!waiter) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cache validate called without any " - "page waiting to be validated"); + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_PAGE_WAIT_VALIDATE, NULL); + + ioc_inode_unlock(ioc_inode); + goto out; } while (waiter) { - waiter_page = waiter->data; - page_waitq = NULL; - - if (waiter_page) { - if (cache_still_valid) { - /* cache valid, wake up page */ - ioc_inode_lock (ioc_inode); - { - page_waitq = - __ioc_page_wakeup (waiter_page); - } - ioc_inode_unlock (ioc_inode); - if (page_waitq) - ioc_waitq_return (page_waitq); - } else { - /* cache invalid, generate page fault and set - * page->ready = 0, to avoid double faults - */ - ioc_inode_lock (ioc_inode); - { - if (waiter_page->ready) { - waiter_page->ready = 0; - need_fault = 1; - } else { - gf_log (frame->this->name, - GF_LOG_TRACE, - "validate frame(%p) is " - "waiting for in-transit" - " page = %p", frame, - waiter_page); - } - } - ioc_inode_unlock (ioc_inode); - - if (need_fault) { - need_fault = 0; - ioc_page_fault (ioc_inode, frame, - local->fd, - waiter_page->offset); - } - } + waiter_page = waiter->data; + ioc_inode->waitq = waiter->next; + page_waitq = NULL; + + if 
(waiter_page) { + if (cache_still_valid) { + /* cache valid, wake up page */ + page_waitq = __ioc_page_wakeup(waiter_page, + waiter_page->op_errno); + if (page_waitq) { + ioc_inode_unlock(ioc_inode); + ioc_waitq_return(page_waitq); + ioc_inode_lock(ioc_inode); + } + } else { + /* cache invalid, generate page fault and set + * page->ready = 0, to avoid double faults + */ + if (waiter_page->ready) { + waiter_page->ready = 0; + need_fault = 1; + } else { + gf_msg_trace(frame->this->name, 0, + "validate " + "frame(%p) is " + "waiting for " + "in-transit" + " page = %p", + frame, waiter_page); + } + + if (need_fault) { + need_fault = 0; + ioc_inode_unlock(ioc_inode); + ioc_page_fault(ioc_inode, frame, local->fd, + waiter_page->offset); + ioc_inode_lock(ioc_inode); + } } + } - waited = waiter; - waiter = waiter->next; + waited = waiter; + waiter = ioc_inode->waitq; - waited->data = NULL; - GF_FREE (waited); + waited->data = NULL; + GF_FREE(waited); } + } + ioc_inode_unlock(ioc_inode); out: - return; + return; } - /* - * ioc_inode_update - create a new ioc_inode_t structure and add it to + * ioc_inode_create - create a new ioc_inode_t structure and add it to * the table table. 
fill in the fields which are derived * from inode_t corresponding to the file * @@ -180,40 +162,37 @@ out: * not for external reference */ ioc_inode_t * -ioc_inode_update (ioc_table_t *table, inode_t *inode, uint32_t weight) +ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight) { - ioc_inode_t *ioc_inode = NULL; + ioc_inode_t *ioc_inode = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + GF_VALIDATE_OR_GOTO("io-cache", table, out); - ioc_inode = GF_CALLOC (1, sizeof (ioc_inode_t), gf_ioc_mt_ioc_inode_t); - if (ioc_inode == NULL) { - goto out; - } + ioc_inode = GF_CALLOC(1, sizeof(ioc_inode_t), gf_ioc_mt_ioc_inode_t); + if (ioc_inode == NULL) { + goto out; + } - ioc_inode->inode = inode; - ioc_inode->table = table; - INIT_LIST_HEAD (&ioc_inode->cache.page_lru); - pthread_mutex_init (&ioc_inode->inode_lock, NULL); - ioc_inode->weight = weight; - - ioc_table_lock (table); - { - table->inode_count++; - list_add (&ioc_inode->inode_list, &table->inodes); - list_add_tail (&ioc_inode->inode_lru, - &table->inode_lru[weight]); - } - ioc_table_unlock (table); + ioc_inode->inode = inode; + ioc_inode->table = table; + INIT_LIST_HEAD(&ioc_inode->cache.page_lru); + pthread_mutex_init(&ioc_inode->inode_lock, NULL); + ioc_inode->weight = weight; + + ioc_table_lock(table); + { + table->inode_count++; + list_add(&ioc_inode->inode_list, &table->inodes); + list_add_tail(&ioc_inode->inode_lru, &table->inode_lru[weight]); + } + ioc_table_unlock(table); - gf_log (table->xl->name, GF_LOG_TRACE, - "adding to inode_lru[%d]", weight); + gf_msg_trace(table->xl->name, 0, "adding to inode_lru[%d]", weight); out: - return ioc_inode; + return ioc_inode; } - /* * ioc_inode_destroy - destroy an ioc_inode_t object. * @@ -222,27 +201,27 @@ out: * to be called only from ioc_forget. 
*/ void -ioc_inode_destroy (ioc_inode_t *ioc_inode) +ioc_inode_destroy(ioc_inode_t *ioc_inode) { - ioc_table_t *table = NULL; + ioc_table_t *table = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; + table = ioc_inode->table; - ioc_table_lock (table); - { - table->inode_count--; - list_del (&ioc_inode->inode_list); - list_del (&ioc_inode->inode_lru); - } - ioc_table_unlock (table); + ioc_table_lock(table); + { + table->inode_count--; + list_del(&ioc_inode->inode_list); + list_del(&ioc_inode->inode_lru); + } + ioc_table_unlock(table); - ioc_inode_flush (ioc_inode); - rbthash_table_destroy (ioc_inode->cache.page_table); + ioc_inode_flush(ioc_inode); + rbthash_table_destroy(ioc_inode->cache.page_table); - pthread_mutex_destroy (&ioc_inode->inode_lock); - GF_FREE (ioc_inode); + pthread_mutex_destroy(&ioc_inode->inode_lock); + GF_FREE(ioc_inode); out: - return; + return; } diff --git a/xlators/performance/io-cache/src/ioc-mem-types.h b/xlators/performance/io-cache/src/ioc-mem-types.h index 421485e2698..20c9a12021e 100644 --- a/xlators/performance/io-cache/src/ioc-mem-types.h +++ b/xlators/performance/io-cache/src/ioc-mem-types.h @@ -1,39 +1,29 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. 
If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __IOC_MT_H__ #define __IOC_MT_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_ioc_mem_types_ { - gf_ioc_mt_iovec = gf_common_mt_end + 1, - gf_ioc_mt_ioc_table_t, - gf_ioc_mt_char, - gf_ioc_mt_ioc_local_t, - gf_ioc_mt_ioc_waitq_t, - gf_ioc_mt_ioc_priority, - gf_ioc_mt_list_head, - gf_ioc_mt_call_pool_t, - gf_ioc_mt_ioc_inode_t, - gf_ioc_mt_ioc_fill_t, - gf_ioc_mt_ioc_newpage_t, - gf_ioc_mt_end + gf_ioc_mt_iovec = gf_common_mt_end + 1, + gf_ioc_mt_ioc_table_t, + gf_ioc_mt_char, + gf_ioc_mt_ioc_waitq_t, + gf_ioc_mt_ioc_priority, + gf_ioc_mt_list_head, + gf_ioc_mt_call_pool_t, + gf_ioc_mt_ioc_inode_t, + gf_ioc_mt_ioc_fill_t, + gf_ioc_mt_ioc_newpage_t, + gf_ioc_mt_end }; #endif diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c index 9afaf306235..84b1ae6cb20 100644 --- a/xlators/performance/io-cache/src/page.c +++ b/xlators/performance/io-cache/src/page.c @@ -1,97 +1,80 @@ /* - Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-cache.h" #include "ioc-mem-types.h" #include <assert.h> #include <sys/time.h> - +#include "io-cache-messages.h" char -ioc_empty (struct ioc_cache *cache) +ioc_empty(struct ioc_cache *cache) { - char is_empty = -1; + char is_empty = -1; - GF_VALIDATE_OR_GOTO ("io-cache", cache, out); + GF_VALIDATE_OR_GOTO("io-cache", cache, out); - is_empty = list_empty (&cache->page_lru); + is_empty = list_empty(&cache->page_lru); out: - return is_empty; + return is_empty; } - ioc_page_t * -__ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset) { - ioc_page_t *page = NULL; - ioc_table_t *table = NULL; - off_t rounded_offset = 0; + ioc_page_t *page = NULL; + ioc_table_t *table = NULL; + off_t rounded_offset = 0; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + table = ioc_inode->table; + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - rounded_offset = floor (offset, table->page_size); + rounded_offset = gf_floor(offset, table->page_size); - page = rbthash_get (ioc_inode->cache.page_table, &rounded_offset, - sizeof (rounded_offset)); + page = rbthash_get(ioc_inode->cache.page_table, &rounded_offset, + 
sizeof(rounded_offset)); - if (page != NULL) { - /* push the page to the end of the lru list */ - list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); - } + if (page != NULL) { + /* push the page to the end of the lru list */ + list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru); + } out: - return page; + return page; } - ioc_page_t * -ioc_page_get (ioc_inode_t *ioc_inode, off_t offset) +ioc_page_get(ioc_inode_t *ioc_inode, off_t offset) { - ioc_page_t *page = NULL; + ioc_page_t *page = NULL; - if (ioc_inode == NULL) { - goto out; - } + if (ioc_inode == NULL) { + goto out; + } - ioc_inode_lock (ioc_inode); - { - page = __ioc_page_get (ioc_inode, offset); - } - ioc_inode_unlock (ioc_inode); + ioc_inode_lock(ioc_inode); + { + page = __ioc_page_get(ioc_inode, offset); + } + ioc_inode_unlock(ioc_inode); out: - return page; + return page; } - /* * __ioc_page_destroy - * @@ -99,102 +82,108 @@ out: * */ int64_t -__ioc_page_destroy (ioc_page_t *page) +__ioc_page_destroy(ioc_page_t *page) { - int64_t page_size = 0; - - GF_VALIDATE_OR_GOTO ("io-cache", page, out); - - if (page->iobref) - page_size = iobref_size (page->iobref); - - if (page->waitq) { - /* frames waiting on this page, do not destroy this page */ - page_size = -1; - } else { - rbthash_remove (page->inode->cache.page_table, &page->offset, - sizeof (page->offset)); - list_del (&page->page_lru); - - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "destroying page = %p, offset = %"PRId64" " - "&& inode = %p", - page, page->offset, page->inode); - - if (page->vector){ - iobref_unref (page->iobref); - GF_FREE (page->vector); - page->vector = NULL; - } - - page->inode = NULL; + int64_t page_size = 0; + + GF_VALIDATE_OR_GOTO("io-cache", page, out); + + if (page->iobref) + page_size = iobref_size(page->iobref); + + if (page->waitq) { + /* frames waiting on this page, do not destroy this page */ + page_size = -1; + page->stale = 1; + } else { + rbthash_remove(page->inode->cache.page_table, 
&page->offset, + sizeof(page->offset)); + list_del(&page->page_lru); + + gf_msg_trace(page->inode->table->xl->name, 0, + "destroying page = %p, offset = %" PRId64 + " " + "&& inode = %p", + page, page->offset, page->inode); + + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); + page->vector = NULL; } - if (page_size != -1) { - pthread_mutex_destroy (&page->page_lock); - GF_FREE (page); - } + page->inode = NULL; + } + + if (page_size != -1) { + pthread_mutex_destroy(&page->page_lock); + GF_FREE(page); + } out: - return page_size; + return page_size; } - int64_t -ioc_page_destroy (ioc_page_t *page) +ioc_page_destroy(ioc_page_t *page) { - int64_t ret = 0; + int64_t ret = 0; + struct ioc_inode *inode = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - ioc_inode_lock (page->inode); - { - ret = __ioc_page_destroy (page); - } - ioc_inode_unlock (page->inode); + ioc_inode_lock(page->inode); + { + inode = page->inode; + ret = __ioc_page_destroy(page); + } + ioc_inode_unlock(inode); out: - return ret; + return ret; } int32_t -__ioc_inode_prune (ioc_inode_t *curr, uint64_t *size_pruned, - uint64_t size_to_prune, uint32_t index) +__ioc_inode_prune(ioc_inode_t *curr, uint64_t *size_pruned, + uint64_t size_to_prune, uint32_t index) { - ioc_page_t *page = NULL, *next = NULL; - int32_t ret = 0; - ioc_table_t *table = NULL; + ioc_page_t *page = NULL, *next = NULL; + int32_t ret = 0; + ioc_table_t *table = NULL; - if (curr == NULL) { - goto out; - } + if (curr == NULL) { + goto out; + } - table = curr->table; + table = curr->table; - list_for_each_entry_safe (page, next, &curr->cache.page_lru, page_lru) { - *size_pruned += page->size; - ret = __ioc_page_destroy (page); + list_for_each_entry_safe(page, next, &curr->cache.page_lru, page_lru) + { + *size_pruned += page->size; + ret = __ioc_page_destroy(page); - if (ret != -1) - table->cache_used -= ret; + if (ret != -1) + table->cache_used -= ret; - gf_log (table->xl->name, 
GF_LOG_TRACE, - "index = %d && table->cache_used = %"PRIu64" && table->" - "cache_size = %"PRIu64, index, table->cache_used, - table->cache_size); + gf_msg_trace(table->xl->name, 0, + "index = %d && " + "table->cache_used = %" PRIu64 + " && table->" + "cache_size = %" PRIu64, + index, table->cache_used, table->cache_size); - if ((*size_pruned) >= size_to_prune) - break; - } + if ((*size_pruned) >= size_to_prune) + break; + } - if (ioc_empty (&curr->cache)) { - list_del_init (&curr->inode_lru); - } + if (ioc_empty(&curr->cache)) { + list_del_init(&curr->inode_lru); + } out: - return 0; + return 0; } /* * ioc_prune - prune the cache. we have a limit to the number of pages we @@ -204,46 +193,44 @@ out: * */ int32_t -ioc_prune (ioc_table_t *table) +ioc_prune(ioc_table_t *table) { - ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; - int32_t index = 0; - uint64_t size_to_prune = 0; - uint64_t size_pruned = 0; + ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; + int32_t index = 0; + uint64_t size_to_prune = 0; + uint64_t size_pruned = 0; + + GF_VALIDATE_OR_GOTO("io-cache", table, out); + + ioc_table_lock(table); + { + size_to_prune = table->cache_used - table->cache_size; + /* take out the least recently used inode */ + for (index = 0; index < table->max_pri; index++) { + list_for_each_entry_safe(curr, next_ioc_inode, + &table->inode_lru[index], inode_lru) + { + /* prune page-by-page for this inode, till + * we reach the equilibrium */ + ioc_inode_lock(curr); + { + __ioc_inode_prune(curr, &size_pruned, size_to_prune, index); + } + ioc_inode_unlock(curr); - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + if (size_pruned >= size_to_prune) + break; + } /* list_for_each_entry_safe (curr...) 
*/ - ioc_table_lock (table); - { - size_to_prune = table->cache_used - table->cache_size; - /* take out the least recently used inode */ - for (index=0; index < table->max_pri; index++) { - list_for_each_entry_safe (curr, next_ioc_inode, - &table->inode_lru[index], - inode_lru) { - /* prune page-by-page for this inode, till - * we reach the equilibrium */ - ioc_inode_lock (curr); - { - __ioc_inode_prune (curr, &size_pruned, - size_to_prune, - index); - } - ioc_inode_unlock (curr); - - if (size_pruned >= size_to_prune) - break; - } /* list_for_each_entry_safe (curr...) */ - - if (size_pruned >= size_to_prune) - break; - } /* for(index=0;...) */ - - } /* ioc_inode_table locked region end */ - ioc_table_unlock (table); + if (size_pruned >= size_to_prune) + break; + } /* for(index=0;...) */ + + } /* ioc_inode_table locked region end */ + ioc_table_unlock(table); out: - return 0; + return 0; } /* @@ -254,47 +241,46 @@ out: * */ ioc_page_t * -__ioc_page_create (ioc_inode_t *ioc_inode, off_t offset) +__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset) { - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - off_t rounded_offset = 0; - ioc_page_t *newpage = NULL; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + off_t rounded_offset = 0; + ioc_page_t *newpage = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); - table = ioc_inode->table; - GF_VALIDATE_OR_GOTO ("io-cache", table, out); + table = ioc_inode->table; + GF_VALIDATE_OR_GOTO("io-cache", table, out); - rounded_offset = floor (offset, table->page_size); + rounded_offset = gf_floor(offset, table->page_size); - newpage = GF_CALLOC (1, sizeof (*newpage), gf_ioc_mt_ioc_newpage_t); - if (newpage == NULL) { - goto out; - } + newpage = GF_CALLOC(1, sizeof(*newpage), gf_ioc_mt_ioc_newpage_t); + if (newpage == NULL) { + goto out; + } - if (!ioc_inode) { - GF_FREE (newpage); - newpage = NULL; - goto out; - } + if (!ioc_inode) { + GF_FREE(newpage); + 
newpage = NULL; + goto out; + } - newpage->offset = rounded_offset; - newpage->inode = ioc_inode; - pthread_mutex_init (&newpage->page_lock, NULL); + newpage->offset = rounded_offset; + newpage->inode = ioc_inode; + pthread_mutex_init(&newpage->page_lock, NULL); - rbthash_insert (ioc_inode->cache.page_table, newpage, &rounded_offset, - sizeof (rounded_offset)); + rbthash_insert(ioc_inode->cache.page_table, newpage, &rounded_offset, + sizeof(rounded_offset)); - list_add_tail (&newpage->page_lru, &ioc_inode->cache.page_lru); + list_add_tail(&newpage->page_lru, &ioc_inode->cache.page_lru); - page = newpage; + page = newpage; - gf_log ("io-cache", GF_LOG_TRACE, - "returning new page %p", page); + gf_msg_trace("io-cache", 0, "returning new page %p", page); out: - return page; + return page; } /* @@ -307,54 +293,55 @@ out: * */ void -__ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size) +__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size) { - ioc_waitq_t *waitq = NULL; - ioc_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - local = frame->local; - - GF_VALIDATE_OR_GOTO (frame->this->name, local, out); - - if (page == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - gf_log (frame->this->name, GF_LOG_WARNING, - "asked to wait on a NULL page"); - } - - waitq = GF_CALLOC (1, sizeof (*waitq), gf_ioc_mt_ioc_waitq_t); - if (waitq == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) waiting on page = %p, offset=%"PRId64", " - "size=%"GF_PRI_SIZET"", - frame, page, offset, size); - - waitq->data = frame; - waitq->next = page->waitq; - waitq->pending_offset = offset; - waitq->pending_size = size; - page->waitq = waitq; - /* one frame can wait only once on a given page, - * local->wait_count is number of pages a frame is waiting on */ - ioc_local_lock (local); - { - local->wait_count++; - } - 
ioc_local_unlock (local); + ioc_waitq_t *waitq = NULL; + ioc_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + local = frame->local; + + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (page == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_NULL_PAGE_WAIT, NULL); + goto out; + } + + waitq = GF_CALLOC(1, sizeof(*waitq), gf_ioc_mt_ioc_waitq_t); + if (waitq == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + gf_msg_trace(frame->this->name, 0, + "frame(%p) waiting on page = %p, offset=%" PRId64 + ", " + "size=%" GF_PRI_SIZET "", + frame, page, offset, size); + + waitq->data = frame; + waitq->next = page->waitq; + waitq->pending_offset = offset; + waitq->pending_size = size; + page->waitq = waitq; + /* one frame can wait only once on a given page, + * local->wait_count is number of pages a frame is waiting on */ + ioc_local_lock(local); + { + local->wait_count++; + } + ioc_local_unlock(local); out: - return; + return; } - /* * ioc_cache_still_valid - see if cached pages ioc_inode are still valid * against given stbuf @@ -365,11 +352,11 @@ out: * assumes ioc_inode is locked */ int8_t -ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) +ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf) { - int8_t cache_still_valid = 1; + int8_t cache_still_valid = 1; - GF_VALIDATE_OR_GOTO ("io-cache", ioc_inode, out); + GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out); #if 0 if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) || @@ -377,9 +364,9 @@ ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) cache_still_valid = 0; #else - if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) - || (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec)) - cache_still_valid = 0; + if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) || + (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec)) + 
cache_still_valid = 0; #endif @@ -392,179 +379,173 @@ ioc_cache_still_valid (ioc_inode_t *ioc_inode, struct iatt *stbuf) #endif out: - return cache_still_valid; + return cache_still_valid; } - void -ioc_waitq_return (ioc_waitq_t *waitq) +ioc_waitq_return(ioc_waitq_t *waitq) { - ioc_waitq_t *trav = NULL; - ioc_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ioc_waitq_t *trav = NULL; + ioc_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ioc_frame_return (frame); - GF_FREE (trav); - } + frame = trav->data; + ioc_frame_return(frame); + GF_FREE(trav); + } } - int -ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) +ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - ioc_local_t *local = NULL; - off_t offset = 0; - ioc_inode_t *ioc_inode = NULL; - ioc_table_t *table = NULL; - ioc_page_t *page = NULL; - int32_t destroy_size = 0; - size_t page_size = 0; - ioc_waitq_t *waitq = NULL; - size_t iobref_page_size = 0; - char zero_filled = 0; - - GF_ASSERT (frame); - - local = frame->local; - GF_ASSERT (local); - - offset = local->pending_offset; - ioc_inode = local->inode; - GF_ASSERT (ioc_inode); + ioc_local_t *local = NULL; + off_t offset = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + int32_t destroy_size = 0; + size_t page_size = 0; + ioc_waitq_t *waitq = NULL; + size_t iobref_page_size = 0; + char zero_filled = 0; + + GF_ASSERT(frame); + + local = frame->local; + GF_ASSERT(local); + + offset = local->pending_offset; + ioc_inode = local->inode; + GF_ASSERT(ioc_inode); + + table = 
ioc_inode->table; + GF_ASSERT(table); + + zero_filled = ((op_ret >= 0) && (stbuf->ia_mtime == 0)); + + ioc_inode_lock(ioc_inode); + { + if (op_ret == -1 || + !(zero_filled || ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_msg_trace(ioc_inode->table->xl->name, 0, + "cache for inode(%p) is invalid. flushing " + "all pages", + ioc_inode); + destroy_size = __ioc_inode_flush(ioc_inode); + } - table = ioc_inode->table; - GF_ASSERT (table); + if ((op_ret >= 0) && !zero_filled) { + ioc_inode->cache.mtime = stbuf->ia_mtime; + ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + } - zero_filled = ((op_ret >=0) && (stbuf->ia_mtime == 0)); + ioc_inode->cache.last_revalidate = gf_time(); - ioc_inode_lock (ioc_inode); - { - if (op_ret == -1 || !(zero_filled || - ioc_cache_still_valid(ioc_inode, - stbuf))) { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "cache for inode(%p) is invalid. flushing " - "all pages", ioc_inode); - destroy_size = __ioc_inode_flush (ioc_inode); + if (op_ret < 0) { + /* error, readv returned -1 */ + page = __ioc_page_get(ioc_inode, offset); + if (page) + waitq = __ioc_page_error(page, op_ret, op_errno); + } else { + gf_msg_trace(ioc_inode->table->xl->name, 0, "op_ret = %d", op_ret); + page = __ioc_page_get(ioc_inode, offset); + if (!page) { + /* page was flushed */ + /* some serious bug ? 
*/ + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_WASTED_COPY, "offset=%" PRId64, offset, + "page-size=%" PRId64, table->page_size, "ioc_inode=%p", + ioc_inode, NULL); + } else { + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); + page->vector = NULL; + page->iobref = NULL; } - if ((op_ret >= 0) && !zero_filled) { - ioc_inode->cache.mtime = stbuf->ia_mtime; - ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec; + /* keep a copy of the page for our cache */ + page->vector = iov_dup(vector, count); + if (page->vector == NULL) { + page = __ioc_page_get(ioc_inode, offset); + if (page != NULL) + waitq = __ioc_page_error(page, -1, ENOMEM); + goto unlock; } - gettimeofday (&ioc_inode->cache.tv, NULL); - - if (op_ret < 0) { - /* error, readv returned -1 */ - page = __ioc_page_get (ioc_inode, offset); - if (page) - waitq = __ioc_page_error (page, op_ret, - op_errno); + page->count = count; + if (iobref) { + page->iobref = iobref_ref(iobref); } else { - gf_log (ioc_inode->table->xl->name, GF_LOG_TRACE, - "op_ret = %d", op_ret); - page = __ioc_page_get (ioc_inode, offset); - if (!page) { - /* page was flushed */ - /* some serious bug ? 
*/ - gf_log (frame->this->name, GF_LOG_WARNING, - "wasted copy: %"PRId64"[+%"PRId64"] " - "ioc_inode=%p", offset, - table->page_size, ioc_inode); - } else { - if (page->vector) { - iobref_unref (page->iobref); - GF_FREE (page->vector); - page->vector = NULL; - } - - /* keep a copy of the page for our cache */ - page->vector = iov_dup (vector, count); - if (page->vector == NULL) { - page = __ioc_page_get (ioc_inode, - offset); - if (page != NULL) - waitq = __ioc_page_error (page, - -1, - ENOMEM); - goto unlock; - } - - page->count = count; - if (iobref) { - page->iobref = iobref_ref (iobref); - } else { - /* TODO: we have got a response to - * our request and no data */ - gf_log (frame->this->name, - GF_LOG_CRITICAL, - "frame>root>rsp_refs is null"); - } /* if(frame->root->rsp_refs) */ - - /* page->size should indicate exactly how - * much the readv call to the child - * translator returned. earlier op_ret - * from child translator was used, which - * gave rise to a bug where reads from - * io-cached volume were resulting in 0 - * byte replies */ - page_size = iov_length(vector, count); - page->size = page_size; - - iobref_page_size = iobref_size (page->iobref); - - if (page->waitq) { - /* wake up all the frames waiting on - * this page, including - * the frame which triggered fault */ - waitq = __ioc_page_wakeup (page); - } /* if(page->waitq) */ - } /* if(!page)...else */ - } /* if(op_ret < 0)...else */ - } /* ioc_inode locked region end */ + /* TODO: we have got a response to + * our request and no data */ + gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM, + IO_CACHE_MSG_FRAME_NULL, NULL); + } /* if(frame->root->rsp_refs) */ + + /* page->size should indicate exactly how + * much the readv call to the child + * translator returned. 
earlier op_ret + * from child translator was used, which + * gave rise to a bug where reads from + * io-cached volume were resulting in 0 + * byte replies */ + page_size = iov_length(vector, count); + page->size = page_size; + page->op_errno = op_errno; + + iobref_page_size = iobref_size(page->iobref); + + if (page->waitq) { + /* wake up all the frames waiting on + * this page, including + * the frame which triggered fault */ + waitq = __ioc_page_wakeup(page, op_errno); + } /* if(page->waitq) */ + } /* if(!page)...else */ + } /* if(op_ret < 0)...else */ + } /* ioc_inode locked region end */ unlock: - ioc_inode_unlock (ioc_inode); + ioc_inode_unlock(ioc_inode); - ioc_waitq_return (waitq); + ioc_waitq_return(waitq); - if (iobref_page_size) { - ioc_table_lock (table); - { - table->cache_used += iobref_page_size; - } - ioc_table_unlock (table); + if (iobref_page_size) { + ioc_table_lock(table); + { + table->cache_used += iobref_page_size; } + ioc_table_unlock(table); + } - if (destroy_size) { - ioc_table_lock (table); - { - table->cache_used -= destroy_size; - } - ioc_table_unlock (table); + if (destroy_size) { + ioc_table_lock(table); + { + table->cache_used -= destroy_size; } + ioc_table_unlock(table); + } - if (ioc_need_prune (ioc_inode->table)) { - ioc_prune (ioc_inode->table); - } + if (ioc_need_prune(ioc_inode->table)) { + ioc_prune(ioc_inode->table); + } - gf_log (frame->this->name, GF_LOG_TRACE, "fault frame %p returned", - frame); - pthread_mutex_destroy (&local->local_lock); + gf_msg_trace(frame->this->name, 0, "fault frame %p returned", frame); + pthread_mutex_destroy(&local->local_lock); - fd_unref (local->fd); + fd_unref(local->fd); + if (local->xattr_req) + dict_unref(local->xattr_req); - STACK_DESTROY (frame->root); - return 0; + STACK_DESTROY(frame->root); + return 0; } - /* * ioc_page_fault - * @@ -575,213 +556,212 @@ unlock: * */ void -ioc_page_fault (ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd, - off_t offset) +ioc_page_fault(ioc_inode_t 
*ioc_inode, call_frame_t *frame, fd_t *fd, + off_t offset) { - ioc_table_t *table = NULL; - call_frame_t *fault_frame = NULL; - ioc_local_t *fault_local = NULL; - int32_t op_ret = -1, op_errno = -1; - ioc_waitq_t *waitq = NULL; - ioc_page_t *page = NULL; - - GF_ASSERT (ioc_inode); - if (frame == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log ("io-cache", GF_LOG_WARNING, - "page fault on a NULL frame"); - goto err; - } - - table = ioc_inode->table; - fault_frame = copy_frame (frame); - if (fault_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - fault_local = GF_CALLOC (1, sizeof (ioc_local_t), - gf_ioc_mt_ioc_local_t); - if (fault_local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - STACK_DESTROY (fault_frame->root); - goto err; - } - - /* NOTE: copy_frame() means, the frame the fop whose fd_ref we - * are using till now won't be valid till we get reply from server. - * we unref this fd, in fault_cbk */ - fault_local->fd = fd_ref (fd); - - fault_frame->local = fault_local; - pthread_mutex_init (&fault_local->local_lock, NULL); - - INIT_LIST_HEAD (&fault_local->fill_list); - fault_local->pending_offset = offset; - fault_local->pending_size = table->page_size; - fault_local->inode = ioc_inode; - - gf_log (frame->this->name, GF_LOG_TRACE, - "stack winding page fault for offset = %"PRId64" with " - "frame %p", offset, fault_frame); - - STACK_WIND (fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), - FIRST_CHILD(fault_frame->this)->fops->readv, fd, - table->page_size, offset); - return; + ioc_table_t *table = NULL; + call_frame_t *fault_frame = NULL; + ioc_local_t *fault_local = NULL; + ioc_local_t *local = NULL; + int32_t op_ret = -1, op_errno = -1; + ioc_waitq_t *waitq = NULL; + ioc_page_t *page = NULL; + + GF_ASSERT(ioc_inode); + if (frame == NULL) { + op_ret = -1; + op_errno = EINVAL; + gf_smsg("io-cache", GF_LOG_WARNING, EINVAL, IO_CACHE_MSG_PAGE_FAULT, + NULL); + goto err; + } + + table = ioc_inode->table; + fault_frame = 
copy_frame(frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + local = frame->local; + fault_local = mem_get0(THIS->local_pool); + if (fault_local == NULL) { + op_ret = -1; + op_errno = ENOMEM; + STACK_DESTROY(fault_frame->root); + goto err; + } + + /* NOTE: copy_frame() means, the frame the fop whose fd_ref we + * are using till now won't be valid till we get reply from server. + * we unref this fd, in fault_cbk */ + fault_local->fd = fd_ref(fd); + + fault_frame->local = fault_local; + pthread_mutex_init(&fault_local->local_lock, NULL); + + INIT_LIST_HEAD(&fault_local->fill_list); + fault_local->pending_offset = offset; + fault_local->pending_size = table->page_size; + fault_local->inode = ioc_inode; + + if (local && local->xattr_req) + fault_local->xattr_req = dict_ref(local->xattr_req); + + gf_msg_trace(frame->this->name, 0, + "stack winding page fault for offset = %" PRId64 + " with " + "frame %p", + offset, fault_frame); + + STACK_WIND(fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, fd, + table->page_size, offset, 0, fault_local->xattr_req); + return; err: - ioc_inode_lock (ioc_inode); - { - page = __ioc_page_get (ioc_inode, offset); - if (page != NULL) { - waitq = __ioc_page_error (page, op_ret, op_errno); - } + ioc_inode_lock(ioc_inode); + { + page = __ioc_page_get(ioc_inode, offset); + if (page != NULL) { + waitq = __ioc_page_error(page, op_ret, op_errno); } - ioc_inode_unlock (ioc_inode); + } + ioc_inode_unlock(ioc_inode); - if (waitq != NULL) { - ioc_waitq_return (waitq); - } + if (waitq != NULL) { + ioc_waitq_return(waitq); + } } - int32_t -__ioc_frame_fill (ioc_page_t *page, call_frame_t *frame, off_t offset, - size_t size) +__ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, + size_t size, int32_t op_errno) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; 
- ioc_inode_t *ioc_inode = NULL; - ioc_fill_t *new = NULL; - int8_t found = 0; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("io-cache", frame, out); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, out); + ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_fill_t *new = NULL; + int8_t found = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("io-cache", frame, out); + + local = frame->local; + GF_VALIDATE_OR_GOTO(frame->this->name, local, out); + + if (page == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + IO_CACHE_MSG_SERVE_READ_REQUEST, NULL); + local->op_ret = -1; + local->op_errno = EINVAL; + goto out; + } + + ioc_inode = page->inode; + + gf_msg_trace(frame->this->name, 0, + "frame (%p) offset = %" PRId64 " && size = %" GF_PRI_SIZET + " " + "&& page->size = %" GF_PRI_SIZET " && wait_count = %d", + frame, offset, size, page->size, local->wait_count); + + /* immediately move this page to the end of the page_lru list */ + list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru); + /* fill local->pending_size bytes from local->pending_offset */ + if (local->op_ret != -1) { + local->op_errno = op_errno; + + if (page->size == 0) { + goto done; + } - if (page == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "NULL page has been provided to serve read request"); - local->op_ret = -1; - local->op_errno = EINVAL; - goto out; + if (offset > page->offset) + /* offset is offset in file, convert it to offset in + * page */ + src_offset = offset - page->offset; + /*FIXME: since offset is the offset within page is the + * else case valid? */ + else + /* local->pending_offset is in previous page. 
do not + * fill until we have filled all previous pages */ + dst_offset = page->offset - offset; + + /* we have to copy from offset to either end of this page + * or till the requested size */ + copy_size = min(page->size - src_offset, size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; } - ioc_inode = page->inode; - - gf_log (frame->this->name, GF_LOG_TRACE, - "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" " - "&& page->size = %"GF_PRI_SIZET" && wait_count = %d", - frame, offset, size, page->size, local->wait_count); - - /* immediately move this page to the end of the page_lru list */ - list_move_tail (&page->page_lru, &ioc_inode->cache.page_lru); - /* fill local->pending_size bytes from local->pending_offset */ - if (local->op_ret != -1 && page->size) { - if (offset > page->offset) - /* offset is offset in file, convert it to offset in - * page */ - src_offset = offset - page->offset; - /*FIXME: since offset is the offset within page is the - * else case valid? */ - else - /* local->pending_offset is in previous page. 
do not - * fill until we have filled all previous pages */ - dst_offset = page->offset - offset; - - /* we have to copy from offset to either end of this page - * or till the requested size */ - copy_size = min (page->size - src_offset, - size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } + gf_msg_trace(page->inode->table->xl->name, 0, + "copy_size = %" GF_PRI_SIZET + " && src_offset = " + "%" PRId64 " && dst_offset = %" PRId64 "", + copy_size, src_offset, dst_offset); - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "copy_size = %"GF_PRI_SIZET" && src_offset = " - "%"PRId64" && dst_offset = %"PRId64"", - copy_size, src_offset, dst_offset); + { + new = GF_CALLOC(1, sizeof(*new), gf_ioc_mt_ioc_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref(page->iobref); + new->count = iov_subset(page->vector, page->count, src_offset, + copy_size, &new->vector, 0); + if (new->count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(new->iobref); + GF_FREE(new); + goto out; + } + + /* add the ioc_fill to fill_list for this frame */ + if (list_empty(&local->fill_list)) { + /* if list is empty, then this is the first + * time we are filling frame, add the + * ioc_fill_t to the end of list */ + list_add_tail(&new->list, &local->fill_list); + } else { + found = 0; + /* list is not empty, we need to look for + * where this offset fits in list */ + list_for_each_entry(fill, &local->fill_list, list) { - new = GF_CALLOC (1, sizeof (*new), - gf_ioc_mt_ioc_fill_t); - if (new == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - 
src_offset, - src_offset + copy_size, - NULL); - - new->vector = GF_CALLOC (new->count, - sizeof (struct iovec), - gf_ioc_mt_iovec); - if (new->vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - - iobref_unref (new->iobref); - GF_FREE (new); - goto out; - } - - new->count = iov_subset (page->vector, page->count, - src_offset, - src_offset + copy_size, - new->vector); - - /* add the ioc_fill to fill_list for this frame */ - if (list_empty (&local->fill_list)) { - /* if list is empty, then this is the first - * time we are filling frame, add the - * ioc_fill_t to the end of list */ - list_add_tail (&new->list, &local->fill_list); - } else { - found = 0; - /* list is not empty, we need to look for - * where this offset fits in list */ - list_for_each_entry (fill, &local->fill_list, - list) { - if (fill->offset > new->offset) { - found = 1; - break; - } - } - - if (found) { - list_add_tail (&new->list, - &fill->list); - } else { - list_add_tail (&new->list, - &local->fill_list); - } - } + if (fill->offset > new->offset) { + found = 1; + break; + } } - local->op_ret += copy_size; + if (found) { + list_add_tail(&new->list, &fill->list); + } else { + list_add_tail(&new->list, &local->fill_list); + } + } } - ret = 0; + local->op_ret += copy_size; + } + +done: + ret = 0; out: - return ret; + return ret; } /* @@ -794,96 +774,109 @@ out: * */ static void -ioc_frame_unwind (call_frame_t *frame) +ioc_frame_unwind(call_frame_t *frame) { - ioc_local_t *local = NULL; - ioc_fill_t *fill = NULL, *next = NULL; - int32_t count = 0; - struct iovec *vector = NULL; - int32_t copied = 0; - struct iobref *iobref = NULL; - struct iatt stbuf = {0,}; - int32_t op_ret = 0; - - GF_ASSERT (frame); - - local = frame->local; - if (local == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "local is NULL"); - op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - // ioc_local_lock (local); - frame->local = NULL; - iobref = iobref_new (); - if (iobref == NULL) { + 
ioc_local_t *local = NULL; + ioc_fill_t *fill = NULL, *next = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + struct iatt stbuf = { + 0, + }; + int32_t op_ret = 0, op_errno = 0; + + GF_ASSERT(frame); + + local = frame->local; + if (local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + IO_CACHE_MSG_LOCAL_NULL, NULL); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + if (local->op_ret < 0) { + op_ret = local->op_ret; + op_errno = local->op_errno; + goto unwind; + } + + // ioc_local_lock (local); + iobref = iobref_new(); + if (iobref == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + if (list_empty(&local->fill_list)) { + gf_msg_trace(frame->this->name, 0, + "frame(%p) has 0 entries in local->fill_list " + "(offset = %" PRId64 " && size = %" GF_PRI_SIZET ")", + frame, local->offset, local->size); + } + + list_for_each_entry(fill, &local->fill_list, list) { count += fill->count; } + + vector = GF_CALLOC(count, sizeof(*vector), gf_ioc_mt_iovec); + if (vector == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } + + list_for_each_entry_safe(fill, next, &local->fill_list, list) + { + /* # TODO: check why this if clause is needed at all. 
*/ + if ((vector != NULL) && (iobref != NULL)) { + memcpy(((char *)vector) + copied, fill->vector, + fill->count * sizeof(*vector)); + + copied += (fill->count * sizeof(*vector)); + + if (iobref_merge(iobref, fill->iobref)) { op_ret = -1; - local->op_errno = ENOMEM; - } - - if (list_empty (&local->fill_list)) { - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) has 0 entries in local->fill_list " - "(offset = %"PRId64" && size = %"GF_PRI_SIZET")", - frame, local->offset, local->size); - } - - list_for_each_entry (fill, &local->fill_list, list) { - count += fill->count; - } - - vector = GF_CALLOC (count, sizeof (*vector), gf_ioc_mt_iovec); - if (vector == NULL) { - op_ret = -1; - local->op_errno = ENOMEM; + op_errno = ENOMEM; + } } - list_for_each_entry_safe (fill, next, &local->fill_list, list) { - if ((vector != NULL) && (iobref != NULL)) { - memcpy (((char *)vector) + copied, - fill->vector, - fill->count * sizeof (*vector)); - - copied += (fill->count * sizeof (*vector)); - - iobref_merge (iobref, fill->iobref); - } - - list_del (&fill->list); - iobref_unref (fill->iobref); - GF_FREE (fill->vector); - GF_FREE (fill); - } + list_del(&fill->list); + iobref_unref(fill->iobref); + GF_FREE(fill->vector); + GF_FREE(fill); + } - if (op_ret != -1) { - op_ret = iov_length (vector, count); - } + if (op_ret != -1) { + op_ret = iov_length(vector, count); + } unwind: - gf_log (frame->this->name, GF_LOG_TRACE, - "frame(%p) unwinding with op_ret=%d", frame, op_ret); - - // ioc_local_unlock (local); - - STACK_UNWIND_STRICT (readv, frame, op_ret, local->op_errno, vector, - count, &stbuf, iobref); - - if (iobref != NULL) { - iobref_unref (iobref); - } - - if (vector != NULL) { - GF_FREE (vector); - vector = NULL; - } - - pthread_mutex_destroy (&local->local_lock); - GF_FREE (local); - - return; + gf_msg_trace(frame->this->name, 0, "frame(%p) unwinding with op_ret=%d", + frame, op_ret); + + // ioc_local_unlock (local); + + frame->local = NULL; + STACK_UNWIND_STRICT(readv, 
frame, op_ret, op_errno, vector, count, &stbuf, + iobref, NULL); + + if (iobref != NULL) { + iobref_unref(iobref); + } + + if (vector != NULL) { + GF_FREE(vector); + vector = NULL; + } + + if (local) { + if (local->xattr_req) + dict_unref(local->xattr_req); + pthread_mutex_destroy(&local->local_lock); + mem_put(local); + } + return; } /* @@ -893,27 +886,27 @@ unwind: * to be called only when a frame is waiting on an in-transit page */ void -ioc_frame_return (call_frame_t *frame) +ioc_frame_return(call_frame_t *frame) { - ioc_local_t *local = NULL; - int32_t wait_count = 0; + ioc_local_t *local = NULL; + int32_t wait_count = 0; - GF_ASSERT (frame); + GF_ASSERT(frame); - local = frame->local; - GF_ASSERT (local->wait_count > 0); + local = frame->local; + GF_ASSERT(local->wait_count > 0); - ioc_local_lock (local); - { - wait_count = --local->wait_count; - } - ioc_local_unlock (local); + ioc_local_lock(local); + { + wait_count = --local->wait_count; + } + ioc_local_unlock(local); - if (!wait_count) { - ioc_frame_unwind (frame); - } + if (!wait_count) { + ioc_frame_unwind(frame); + } - return; + return; } /* @@ -923,37 +916,39 @@ ioc_frame_return (call_frame_t *frame) * to be called only when a frame is waiting on an in-transit page */ ioc_waitq_t * -__ioc_page_wakeup (ioc_page_t *page) +__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - int32_t ret = -1; + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int32_t ret = -1; - GF_VALIDATE_OR_GOTO ("io-cache", page, out); + GF_VALIDATE_OR_GOTO("io-cache", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - page->ready = 1; + page->ready = 1; - gf_log (page->inode->table->xl->name, GF_LOG_TRACE, - "page is %p && waitq = %p", page, waitq); + gf_msg_trace(page->inode->table->xl->name, 0, "page is %p && waitq = %p", + page, waitq); - for (trav = waitq; trav; trav = 
trav->next) { - frame = trav->data; - ret = __ioc_frame_fill (page, frame, trav->pending_offset, - trav->pending_size); - if (ret == -1) { - break; - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ret = __ioc_frame_fill(page, frame, trav->pending_offset, + trav->pending_size, op_errno); + if (ret == -1) { + break; } + } + + if (page->stale) { + __ioc_page_destroy(page); + } out: - return waitq; + return waitq; } - - /* * ioc_page_error - * @page: @@ -962,46 +957,45 @@ out: * */ ioc_waitq_t * -__ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) +__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno) { - ioc_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; - int64_t ret = 0; - ioc_table_t *table = NULL; - ioc_local_t *local = NULL; + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int64_t ret = 0; + ioc_table_t *table = NULL; + ioc_local_t *local = NULL; - GF_VALIDATE_OR_GOTO ("io-cache", page, out); + GF_VALIDATE_OR_GOTO("io-cache", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - gf_log (page->inode->table->xl->name, GF_LOG_WARNING, - "page error for page = %p & waitq = %p", page, waitq); + gf_msg_debug(page->inode->table->xl->name, 0, + "page error for page = %p & waitq = %p", page, waitq); - for (trav = waitq; trav; trav = trav->next) { + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - frame = trav->data; - - local = frame->local; - ioc_local_lock (local); - { - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - } - ioc_local_unlock (local); + local = frame->local; + ioc_local_lock(local); + { + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } } + ioc_local_unlock(local); + } - table = page->inode->table; - ret = __ioc_page_destroy (page); + table = page->inode->table; + ret = __ioc_page_destroy(page); - if (ret 
!= -1) { - table->cache_used -= ret; - } + if (ret != -1) { + table->cache_used -= ret; + } out: - return waitq; + return waitq; } /* @@ -1012,20 +1006,22 @@ out: * */ ioc_waitq_t * -ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno) +ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno) { - ioc_waitq_t *waitq = NULL; + ioc_waitq_t *waitq = NULL; + struct ioc_inode *inode = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - ioc_inode_lock (page->inode); - { - waitq = __ioc_page_error (page, op_ret, op_errno); - } - ioc_inode_unlock (page->inode); + ioc_inode_lock(page->inode); + { + inode = page->inode; + waitq = __ioc_page_error(page, op_ret, op_errno); + } + ioc_inode_unlock(inode); out: - return waitq; + return waitq; } diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am index 72f9a801287..7570cf41ed2 100644 --- a/xlators/performance/io-threads/src/Makefile.am +++ b/xlators/performance/io-threads/src/Makefile.am @@ -1,14 +1,16 @@ xlator_LTLIBRARIES = io-threads.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -io_threads_la_LDFLAGS = -module -avoidversion +io_threads_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) io_threads_la_SOURCES = io-threads.c io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = io-threads.h iot-mem-types.h +noinst_HEADERS = io-threads.h iot-mem-types.h io-threads-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/io-threads/src/io-threads-messages.h b/xlators/performance/io-threads/src/io-threads-messages.h new file mode 100644 index 
00000000000..6229c353f96 --- /dev/null +++ b/xlators/performance/io-threads/src/io-threads-messages.h @@ -0,0 +1,41 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _IO_THREADS_MESSAGES_H_ +#define _IO_THREADS_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(IO_THREADS, IO_THREADS_MSG_INIT_FAILED, + IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, IO_THREADS_MSG_NO_MEMORY, + IO_THREADS_MSG_VOL_MISCONFIGURED, IO_THREADS_MSG_SIZE_NOT_SET, + IO_THREADS_MSG_OUT_OF_MEMORY, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED); + +#define IO_THREADS_MSG_INIT_FAILED_STR "Thread attribute initialization failed" +#define IO_THREADS_MSG_SIZE_NOT_SET_STR "Using default thread stack size" +#define IO_THREADS_MSG_NO_MEMORY_STR "Memory accounting init failed" +#define IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED_STR \ + "FATAL: iot not configured with exactly one child" +#define IO_THREADS_MSG_VOL_MISCONFIGURED_STR "dangling volume. 
check volfile" +#define IO_THREADS_MSG_OUT_OF_MEMORY_STR "out of memory" +#define IO_THREADS_MSG_PTHREAD_INIT_FAILED_STR "init failed" +#define IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED_STR \ + "cannot initialize worker threads, exiting init" +#endif /* _IO_THREADS_MESSAGES_H_ */ diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index d6d0ada23e4..3d24cc97f4b 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -1,211 +1,307 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "call-stub.h" -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "io-threads.h" +#include <signal.h> #include <stdlib.h> #include <sys/time.h> #include <time.h> -#include "locking.h" +#include <glusterfs/locking.h> +#include "io-threads-messages.h" +#include <glusterfs/timespec.h> -void *iot_worker (void *arg); -int iot_workers_scale (iot_conf_t *conf); -int __iot_workers_scale (iot_conf_t *conf); +void * +iot_worker(void *arg); +int +iot_workers_scale(iot_conf_t *conf); +int +__iot_workers_scale(iot_conf_t *conf); struct volume_options options[]; +#define IOT_FOP(name, frame, this, args...) \ + do { \ + call_stub_t *__stub = NULL; \ + int __ret = -1; \ + \ + __stub = fop_##name##_stub(frame, default_##name##_resume, args); \ + if (!__stub) { \ + __ret = -ENOMEM; \ + goto out; \ + } \ + \ + __ret = iot_schedule(frame, this, __stub); \ + \ + out: \ + if (__ret < 0) { \ + default_##name##_failure_cbk(frame, -__ret); \ + if (__stub != NULL) { \ + call_stub_destroy(__stub); \ + } \ + } \ + } while (0) + +iot_client_ctx_t * +iot_get_ctx(xlator_t *this, client_t *client) +{ + iot_client_ctx_t *ctx = NULL; + iot_client_ctx_t *setted_ctx = NULL; + int i; + + if (client_ctx_get(client, this, (void **)&ctx) != 0) { + ctx = GF_MALLOC(GF_FOP_PRI_MAX * sizeof(*ctx), gf_iot_mt_client_ctx_t); + if (ctx) { + for (i = 0; i < GF_FOP_PRI_MAX; ++i) { + INIT_LIST_HEAD(&ctx[i].clients); + INIT_LIST_HEAD(&ctx[i].reqs); + } + setted_ctx = client_ctx_set(client, this, ctx); + if (ctx != setted_ctx) { + GF_FREE(ctx); + ctx = setted_ctx; + } + } + } + + return ctx; +} + call_stub_t * -__iot_dequeue (iot_conf_t *conf, int *pri) -{ - call_stub_t *stub = NULL; - int i = 0; - - *pri 
= -1; - for (i = 0; i < IOT_PRI_MAX; i++) { - if (list_empty (&conf->reqs[i]) || - (conf->ac_iot_count[i] >= conf->ac_iot_limit[i])) - continue; - stub = list_entry (conf->reqs[i].next, call_stub_t, list); - conf->ac_iot_count[i]++; - *pri = i; - break; +__iot_dequeue(iot_conf_t *conf, int *pri) +{ + call_stub_t *stub = NULL; + int i = 0; + iot_client_ctx_t *ctx; + + *pri = -1; + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (conf->ac_iot_count[i] >= conf->ac_iot_limit[i]) { + continue; } - if (!stub) - return NULL; + if (list_empty(&conf->clients[i])) { + continue; + } - conf->queue_size--; - list_del_init (&stub->list); + /* Get the first per-client queue for this priority. */ + ctx = list_first_entry(&conf->clients[i], iot_client_ctx_t, clients); + if (!ctx) { + continue; + } - return stub; -} + if (list_empty(&ctx->reqs)) { + continue; + } + + /* Get the first request on that queue. */ + stub = list_first_entry(&ctx->reqs, call_stub_t, list); + list_del_init(&stub->list); + if (list_empty(&ctx->reqs)) { + list_del_init(&ctx->clients); + } else { + list_rotate_left(&conf->clients[i]); + } + + conf->ac_iot_count[i]++; + conf->queue_marked[i] = _gf_false; + *pri = i; + break; + } + if (!stub) + return NULL; + + conf->queue_size--; + conf->queue_sizes[*pri]--; + + return stub; +} void -__iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri) +__iot_enqueue(iot_conf_t *conf, call_stub_t *stub, int pri) { - if (pri < 0 || pri >= IOT_PRI_MAX) - pri = IOT_PRI_MAX-1; + client_t *client = stub->frame->root->client; + iot_client_ctx_t *ctx; - list_add_tail (&stub->list, &conf->reqs[pri]); + if (pri < 0 || pri >= GF_FOP_PRI_MAX) + pri = GF_FOP_PRI_MAX - 1; - conf->queue_size++; + if (client) { + ctx = iot_get_ctx(THIS, client); + if (ctx) { + ctx = &ctx[pri]; + } + } else { + ctx = NULL; + } + if (!ctx) { + ctx = &conf->no_client[pri]; + } - return; -} + if (list_empty(&ctx->reqs)) { + list_add_tail(&ctx->clients, &conf->clients[pri]); + } + list_add_tail(&stub->list, 
&ctx->reqs); + conf->queue_size++; + GF_ATOMIC_INC(conf->stub_cnt); + conf->queue_sizes[pri]++; +} void * -iot_worker (void *data) -{ - iot_conf_t *conf = NULL; - xlator_t *this = NULL; - call_stub_t *stub = NULL; - struct timespec sleep_till = {0, }; - int ret = 0; - int pri = -1; - char timeout = 0; - char bye = 0; +iot_worker(void *data) +{ + iot_conf_t *conf = NULL; + xlator_t *this = NULL; + call_stub_t *stub = NULL; + struct timespec sleep_till = { + 0, + }; + int ret = 0; + int pri = -1; + gf_boolean_t bye = _gf_false; + + conf = data; + this = conf->this; + THIS = this; + + for (;;) { + pthread_mutex_lock(&conf->mutex); + { + if (pri != -1) { + conf->ac_iot_count[pri]--; + pri = -1; + } + while (conf->queue_size == 0) { + if (conf->down) { + bye = _gf_true; /*Avoid sleep*/ + break; + } - conf = data; - this = conf->this; - THIS = this; + clock_gettime(CLOCK_REALTIME_COARSE, &sleep_till); + sleep_till.tv_sec += conf->idle_time; - for (;;) { - sleep_till.tv_sec = time (NULL) + conf->idle_time; + conf->sleep_count++; + ret = pthread_cond_timedwait(&conf->cond, &conf->mutex, + &sleep_till); + conf->sleep_count--; - pthread_mutex_lock (&conf->mutex); - { - if (pri != -1) { - conf->ac_iot_count[pri]--; - pri = -1; - } - while (conf->queue_size == 0) { - conf->sleep_count++; - - ret = pthread_cond_timedwait (&conf->cond, - &conf->mutex, - &sleep_till); - conf->sleep_count--; - - if (ret == ETIMEDOUT) { - timeout = 1; - break; - } - } - - if (timeout) { - if (conf->curr_count > IOT_MIN_THREADS) { - conf->curr_count--; - bye = 1; - gf_log (conf->this->name, GF_LOG_DEBUG, - "timeout, terminated. 
conf->curr_count=%d", - conf->curr_count); - } else { - timeout = 0; - } - } - - stub = __iot_dequeue (conf, &pri); + if (conf->down || ret == ETIMEDOUT) { + bye = _gf_true; + break; } - pthread_mutex_unlock (&conf->mutex); - - if (stub) /* guard against spurious wakeups */ - call_resume (stub); + } + + if (bye) { + if (conf->down || conf->curr_count > IOT_MIN_THREADS) { + conf->curr_count--; + if (conf->curr_count == 0) + pthread_cond_broadcast(&conf->cond); + gf_msg_debug(conf->this->name, 0, + "terminated. " + "conf->curr_count=%d", + conf->curr_count); + } else { + bye = _gf_false; + } + } - if (bye) - break; + if (!bye) + stub = __iot_dequeue(conf, &pri); } + pthread_mutex_unlock(&conf->mutex); - if (pri != -1) { - pthread_mutex_lock (&conf->mutex); - { - conf->ac_iot_count[pri]--; - } - pthread_mutex_unlock (&conf->mutex); + if (stub) { /* guard against spurious wakeups */ + if (stub->poison) { + gf_log(this->name, GF_LOG_INFO, "Dropping poisoned request %p.", + stub); + call_stub_destroy(stub); + } else { + call_resume(stub); + } + GF_ATOMIC_DEC(conf->stub_cnt); } - return NULL; -} + stub = NULL; + + if (bye) + break; + } + return NULL; +} int -do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri) +do_iot_schedule(iot_conf_t *conf, call_stub_t *stub, int pri) { - int ret = 0; + int ret = 0; - pthread_mutex_lock (&conf->mutex); - { - __iot_enqueue (conf, stub, pri); + pthread_mutex_lock(&conf->mutex); + { + __iot_enqueue(conf, stub, pri); - pthread_cond_signal (&conf->cond); + pthread_cond_signal(&conf->cond); - ret = __iot_workers_scale (conf); - } - pthread_mutex_unlock (&conf->mutex); + ret = __iot_workers_scale(conf); + } + pthread_mutex_unlock(&conf->mutex); - return ret; + return ret; } -char* -iot_get_pri_meaning (iot_pri_t pri) -{ - char *name = NULL; - switch (pri) { - case IOT_PRI_HI: - name = "fast"; - break; - case IOT_PRI_NORMAL: - name = "normal"; - break; - case IOT_PRI_LO: - name = "slow"; - break; - case IOT_PRI_LEAST: - name = 
"least priority"; - break; - case IOT_PRI_MAX: - name = "invalid"; - break; - } - return name; +char * +iot_get_pri_meaning(gf_fop_pri_t pri) +{ + char *name = NULL; + switch (pri) { + case GF_FOP_PRI_HI: + name = "fast"; + break; + case GF_FOP_PRI_NORMAL: + name = "normal"; + break; + case GF_FOP_PRI_LO: + name = "slow"; + break; + case GF_FOP_PRI_LEAST: + name = "least"; + break; + case GF_FOP_PRI_MAX: + name = "invalid"; + break; + case GF_FOP_PRI_UNSPEC: + name = "unspecified"; + break; + } + return name; } int -iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) +iot_schedule(call_frame_t *frame, xlator_t *this, call_stub_t *stub) { - int ret = -1; - iot_pri_t pri = IOT_PRI_MAX - 1; + int ret = -1; + gf_fop_pri_t pri = GF_FOP_PRI_MAX - 1; + iot_conf_t *conf = this->private; - if (frame->root->pid < 0) { - pri = IOT_PRI_LEAST; - goto out; - } + if ((frame->root->pid < GF_CLIENT_PID_MAX) && + (frame->root->pid != GF_CLIENT_PID_NO_ROOT_SQUASH) && + conf->least_priority) { + pri = GF_FOP_PRI_LEAST; + goto out; + } - switch (stub->fop) { + switch (stub->fop) { case GF_FOP_OPEN: case GF_FOP_STAT: case GF_FOP_FSTAT: @@ -216,8 +312,12 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_STATFS: case GF_FOP_READDIR: case GF_FOP_READDIRP: - pri = IOT_PRI_HI; - break; + case GF_FOP_GETACTIVELK: + case GF_FOP_SETACTIVELK: + case GF_FOP_ICREATE: + case GF_FOP_NAMELINK: + pri = GF_FOP_PRI_HI; + break; case GF_FOP_CREATE: case GF_FOP_FLUSH: @@ -226,6 +326,7 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_FINODELK: case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: + case GF_FOP_LEASE: case GF_FOP_UNLINK: case GF_FOP_SETATTR: case GF_FOP_FSETATTR: @@ -240,8 +341,10 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_FGETXATTR: case GF_FOP_FSETXATTR: case GF_FOP_REMOVEXATTR: - pri = IOT_PRI_NORMAL; - break; + case GF_FOP_FREMOVEXATTR: + case GF_FOP_PUT: + pri = 
GF_FOP_PRI_NORMAL; + break; case GF_FOP_READ: case GF_FOP_WRITE: @@ -251,2332 +354,1237 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_FSYNCDIR: case GF_FOP_XATTROP: case GF_FOP_FXATTROP: - pri = IOT_PRI_LO; - break; - case GF_FOP_RCHECKSUM: - pri = IOT_PRI_LEAST; - break; + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + case GF_FOP_SEEK: + pri = GF_FOP_PRI_LO; + break; - case GF_FOP_NULL: case GF_FOP_FORGET: case GF_FOP_RELEASE: case GF_FOP_RELEASEDIR: case GF_FOP_GETSPEC: - case GF_FOP_MAXVALUE: - //fail compilation on missing fop - //new fop must choose priority. - break; - } + break; + case GF_FOP_IPC: + default: + return -EINVAL; + } out: - ret = do_iot_schedule (this->private, stub, pri); - gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop", - gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); - return ret; + gf_msg_debug(this->name, 0, "%s scheduled as %s priority fop", + gf_fop_list[stub->fop], iot_get_pri_meaning(pri)); + if (this->private) + ret = do_iot_schedule(this->private, stub, pri); + return ret; } int -iot_lookup_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +iot_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xattr, - postparent); - return 0; + IOT_FOP(lookup, frame, this, loc, xdata); + return 0; } - int -iot_lookup_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *xattr_req) +iot_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - STACK_WIND (frame, iot_lookup_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->lookup, - loc, xattr_req); - return 0; + IOT_FOP(setattr, frame, this, loc, stbuf, valid, xdata); + return 0; } - int -iot_lookup (call_frame_t *frame, xlator_t *this, 
loc_t *loc, dict_t *xattr_req) +iot_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xattr_req); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create lookup stub (out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - if (stub != NULL) { - call_stub_destroy (stub); - } - STACK_UNWIND_STRICT (lookup, frame, -1, -ret, NULL, NULL, NULL, - NULL); - } - - return 0; + IOT_FOP(fsetattr, frame, this, fd, stbuf, valid, xdata); + return 0; } - int -iot_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +iot_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop); - return 0; + IOT_FOP(access, frame, this, loc, mask, xdata); + return 0; } - int -iot_setattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) +iot_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - STACK_WIND (frame, iot_setattr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid); - return 0; -} - - -int -iot_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_setattr_stub (frame, iot_setattr_wrapper, loc, stbuf, valid); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "Cannot create setattr stub" - "(Out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - if (stub != NULL) { - call_stub_destroy (stub); - } - - STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL); - } - - return 0; + 
IOT_FOP(readlink, frame, this, loc, size, xdata); + return 0; } - int -iot_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +iot_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) { - STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, preop, postop); - return 0; + IOT_FOP(mknod, frame, this, loc, mode, rdev, umask, xdata); + return 0; } - int -iot_fsetattr_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *stbuf, int32_t valid) +iot_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - STACK_WIND (frame, iot_fsetattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid); - return 0; -} - - -int -iot_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsetattr_stub (frame, iot_fsetattr_wrapper, fd, stbuf, - valid); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsetattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(mkdir, frame, this, loc, mode, umask, xdata); + return 0; } - int -iot_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +iot_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno); - return 0; + IOT_FOP(rmdir, frame, this, loc, flags, xdata); + return 0; } - int -iot_access_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t mask) +iot_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t 
*loc, mode_t umask, dict_t *xdata) { - STACK_WIND (frame, iot_access_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->access, loc, mask); - return 0; + IOT_FOP(symlink, frame, this, linkname, loc, umask, xdata); + return 0; } - int -iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +iot_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_access_stub (frame, iot_access_wrapper, loc, mask); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create access stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (access, frame, -1, -ret); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(rename, frame, this, oldloc, newloc, xdata); + return 0; } - int -iot_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *stbuf) +iot_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, stbuf); - return 0; + IOT_FOP(open, frame, this, loc, flags, fd, xdata); + return 0; } - int -iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size) +iot_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, iot_readlink_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readlink, - loc, size); - return 0; + IOT_FOP(create, frame, this, loc, flags, mode, umask, fd, xdata); + return 0; } - int -iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) +iot_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, uint32_t flags, struct iovec *vector, int32_t count, + off_t offset, struct 
iobref *iobref, dict_t *xattr, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create readlink stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(put, frame, this, loc, mode, umask, flags, vector, count, offset, + iobref, xattr, xdata); + return 0; } - int -iot_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +iot_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; + IOT_FOP(readv, frame, this, fd, size, offset, flags, xdata); + return 0; } - int -iot_mknod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) +iot_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, iot_mknod_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, params); - return 0; + IOT_FOP(flush, frame, this, fd, xdata); + return 0; } - int -iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev, - params); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create mknod stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, 
NULL, - NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_mkdir_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +iot_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; + IOT_FOP(fsync, frame, this, fd, datasync, xdata); + return 0; } - int -iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) +iot_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - STACK_WIND (frame, iot_mkdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->mkdir, loc, mode, params); - return 0; + IOT_FOP(writev, frame, this, fd, vector, count, offset, flags, iobref, + xdata); + return 0; } - int -iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode, params); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create mkdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL, - NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) +iot_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { - STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, - postparent); - return 0; + 
IOT_FOP(lk, frame, this, fd, cmd, flock, xdata); + return 0; } - int -iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +iot_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_WIND (frame, iot_rmdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rmdir, loc, flags); - return 0; + IOT_FOP(stat, frame, this, loc, xdata); + return 0; } - int -iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) +iot_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc, flags); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create rmdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(fstat, frame, this, fd, xdata); + return 0; } - int -iot_symlink_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +iot_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; + IOT_FOP(truncate, frame, this, loc, offset, xdata); + return 0; } - int -iot_symlink_wrapper (call_frame_t *frame, xlator_t *this, const char *linkname, - loc_t *loc, dict_t *params) +iot_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - STACK_WIND (frame, iot_symlink_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->symlink, linkname, loc, params); - return 0; + IOT_FOP(ftruncate, frame, this, fd, offset, xdata); + return 0; } - int -iot_symlink (call_frame_t *frame, xlator_t *this, const char 
*linkname, - loc_t *loc, dict_t *params) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc, - params); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create symlink stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL, - NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - -int -iot_rename_cbk (call_frame_t *frame, void * cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) +iot_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) { - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); - return 0; + IOT_FOP(unlink, frame, this, loc, xflag, xdata); + return 0; } - int -iot_rename_wrapper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) +iot_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - STACK_WIND (frame, iot_rename_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->rename, oldloc, newloc); - return 0; + IOT_FOP(link, frame, this, oldloc, newloc, xdata); + return 0; } - int -iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) +iot_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc); - if (!stub) { - gf_log (this->name, GF_LOG_DEBUG, "cannot create rename stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, 
NULL, NULL, - NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(opendir, frame, this, loc, fd, xdata); + return 0; } - int -iot_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) +iot_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); - return 0; + IOT_FOP(fsyncdir, frame, this, fd, datasync, xdata); + return 0; } - int -iot_open_wrapper (call_frame_t * frame, xlator_t * this, loc_t *loc, - int32_t flags, fd_t * fd, int32_t wbflags) +iot_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_WIND (frame, iot_open_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, loc, flags, fd, wbflags); - return 0; -} - - -int -iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd, wbflags); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create open call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(statfs, frame, this, loc, xdata); + return 0; } - int -iot_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent) +iot_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, - preparent, postparent); - return 0; + IOT_FOP(setxattr, frame, this, loc, dict, flags, xdata); + return 0; } - int 
-iot_create_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, mode_t mode, fd_t *fd, dict_t *params) +iot_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - STACK_WIND (frame, iot_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); - return 0; -} + iot_conf_t *conf = NULL; + dict_t *depths = NULL; + int i = 0; + int32_t op_ret = 0; + int32_t op_errno = 0; + conf = this->private; -int -iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd, dict_t *params) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_create_stub (frame, iot_create_wrapper, loc, flags, mode, - fd, params); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create \"create\" call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; + if (name && strcmp(name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { + /* + * We explicitly do not want a reference count + * for this dict in this translator + */ + depths = dict_new(); + if (!depths) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind_special_getxattr; } - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL, - NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (dict_set_int32(depths, (char *)fop_pri_to_string(i), + conf->queue_sizes[i]) != 0) { + dict_unref(depths); + depths = NULL; + goto unwind_special_getxattr; + } } + unwind_special_getxattr: + STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, depths, xdata); + if (depths) + dict_unref(depths); return 0; -} - - -int -iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) -{ - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - 
stbuf, iobref); - - return 0; -} - - -int -iot_readv_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - STACK_WIND (frame, iot_readv_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, size, offset); - return 0; -} - + } -int -iot_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create readv call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, -1, NULL, - NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(getxattr, frame, this, loc, name, xdata); + return 0; } - int -iot_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +iot_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); - return 0; + IOT_FOP(fgetxattr, frame, this, fd, name, xdata); + return 0; } - int -iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - STACK_WIND (frame, iot_flush_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); - return 0; + IOT_FOP(fsetxattr, frame, this, fd, dict, flags, xdata); + return 0; } - int -iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_flush_stub (frame, iot_flush_wrapper, fd); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create flush_cbk call stub" - "(out of memory)"); - ret = 
-ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (flush, frame, -1, -ret); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(removexattr, frame, this, loc, name, xdata); + return 0; } - int -iot_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +iot_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + IOT_FOP(fremovexattr, frame, this, fd, name, xdata); + return 0; } - int -iot_fsync_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +iot_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - STACK_WIND (frame, iot_fsync_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, - fd, datasync); - return 0; + IOT_FOP(readdirp, frame, this, fd, size, offset, xdata); + return 0; } - int -iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) +iot_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fsync_cbk call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsync, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(readdir, frame, this, fd, size, offset, xdata); + return 0; } - int -iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +iot_inodelk(call_frame_t *frame, xlator_t 
*this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + IOT_FOP(inodelk, frame, this, volume, loc, cmd, lock, xdata); + return 0; } - int -iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, - off_t offset, struct iobref *iobref) +iot_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - STACK_WIND (frame, iot_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, iobref); - return 0; + IOT_FOP(finodelk, frame, this, volume, fd, cmd, lock, xdata); + return 0; } - int -iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +iot_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_writev_stub (frame, iot_writev_wrapper, - fd, vector, count, offset, iobref); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create writev call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(entrylk, frame, this, volume, loc, basename, cmd, type, xdata); + return 0; } - -int32_t -iot_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *flock) -{ - STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock); - return 0; -} - - int -iot_lk_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t cmd, struct gf_flock *flock) +iot_fentrylk(call_frame_t 
*frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - STACK_WIND (frame, iot_lk_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, - fd, cmd, flock); - return 0; + IOT_FOP(fentrylk, frame, this, volume, fd, basename, cmd, type, xdata); + return 0; } - int -iot_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *flock) +iot_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_lk call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (lk, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(xattrop, frame, this, loc, optype, xattr, xdata); + return 0; } - int -iot_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +iot_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf); - return 0; + IOT_FOP(fxattrop, frame, this, fd, optype, xattr, xdata); + return 0; } - -int -iot_stat_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +int32_t +iot_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) { - STACK_WIND (frame, iot_stat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, - loc); - return 0; + IOT_FOP(rchecksum, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t 
offset, size_t len, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_stat_stub (frame, iot_stat_wrapper, loc); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_stat call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (stat, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(fallocate, frame, this, fd, mode, offset, len, xdata); + return 0; } - int -iot_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf); - return 0; + IOT_FOP(discard, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - STACK_WIND (frame, iot_fstat_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, - fd); - return 0; + IOT_FOP(zerofill, frame, this, fd, offset, len, xdata); + return 0; } - int -iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +iot_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) { - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_fstat call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fstat, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + IOT_FOP(seek, frame, this, fd, offset, what, xdata); + return 0; } - int -iot_truncate_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +iot_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) { - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + IOT_FOP(lease, frame, this, loc, lease, xdata); + return 0; } - int -iot_truncate_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) +iot_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - STACK_WIND (frame, iot_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - loc, offset); - return 0; + IOT_FOP(getactivelk, frame, this, loc, xdata); + return 0; } - int -iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +iot_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, + lock_migration_info_t *locklist, dict_t *xdata) { - call_stub_t *stub; - int ret = -1; - - stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset); - - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_stat call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; + IOT_FOP(setactivelk, frame, this, loc, locklist, xdata); + return 0; } - int -iot_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +__iot_workers_scale(iot_conf_t *conf) { - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; -} + int scale = 0; + int diff = 0; + pthread_t thread; + int ret = 0; + int i = 0; + for (i = 0; i < GF_FOP_PRI_MAX; i++) + scale += min(conf->queue_sizes[i], conf->ac_iot_limit[i]); -int -iot_ftruncate_wrapper 
(call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) -{ - STACK_WIND (frame, iot_ftruncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, - fd, offset); - return 0; -} + if (scale < IOT_MIN_THREADS) + scale = IOT_MIN_THREADS; + if (scale > conf->max_count) + scale = conf->max_count; -int -iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_ftruncate call stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (conf->curr_count < scale) { + diff = scale - conf->curr_count; + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (ftruncate, frame, -1, -ret, NULL, NULL); + while (diff) { + diff--; - if (stub != NULL) { - call_stub_destroy (stub); - } + ret = gf_thread_create(&thread, &conf->w_attr, iot_worker, conf, + "iotwr%03hx", conf->curr_count & 0x3ff); + if (ret == 0) { + pthread_detach(thread); + conf->curr_count++; + gf_msg_debug(conf->this->name, 0, + "scaled threads to %d (queue_size=%d/%d)", + conf->curr_count, conf->queue_size, scale); + } else { + break; } - return 0; -} - - + } -int -iot_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, - postparent); - return 0; + return diff; } - int -iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) +iot_workers_scale(iot_conf_t *conf) { - STACK_WIND (frame, iot_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - loc); - return 0; -} + int ret = -1; + if (conf == NULL) { + ret = -EINVAL; + goto out; + } -int -iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = 
fop_unlink_stub (frame, iot_unlink_wrapper, loc); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, - "cannot create fop_unlink call stub" - "(out of memory)"); - ret = -1; - goto out; - } - - ret = iot_schedule (frame, this, stub); + pthread_mutex_lock(&conf->mutex); + { + ret = __iot_workers_scale(conf); + } + pthread_mutex_unlock(&conf->mutex); out: - if (ret < 0) { - STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - -int -iot_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, struct iatt *postparent) -{ - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, - preparent, postparent); - return 0; + return ret; } - int -iot_link_wrapper (call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new) +set_stack_size(iot_conf_t *conf) { - STACK_WIND (frame, iot_link_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->link, old, new); - - return 0; -} + int err = 0; + size_t stacksize = IOT_THREAD_STACK_SIZE; + xlator_t *this = NULL; + this = THIS; -int -iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) -{ - call_stub_t *stub = NULL; - int ret = -1; + err = pthread_attr_init(&conf->w_attr); + if (err != 0) { + gf_smsg(this->name, GF_LOG_ERROR, err, IO_THREADS_MSG_INIT_FAILED, + NULL); + return err; + } - stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create link stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; + err = pthread_attr_setstacksize(&conf->w_attr, stacksize); + if (err == EINVAL) { + err = pthread_attr_getstacksize(&conf->w_attr, &stacksize); + if (!err) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET, + "size=%zd", stacksize, NULL); + } else { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET, + NULL); 
+ err = 0; } + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL, - NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + conf->stack_size = stacksize; + return err; } - -int -iot_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd); - return 0; -} - - -int -iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - STACK_WIND (frame, iot_opendir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->opendir, loc, fd); - return 0; -} - - -int -iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create opendir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno); - return 0; -} - - -int -iot_fsyncdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int datasync) +int32_t +mem_acct_init(xlator_t *this) { - STACK_WIND (frame, iot_fsyncdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsyncdir, fd, datasync); - return 0; -} - + int ret = -1; -int -iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) -{ - call_stub_t *stub = NULL; - int ret = -1; + if (!this) + return ret; - stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsyncdir 
stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + ret = xlator_mem_acct_init(this, gf_iot_mt_end + 1); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsyncdir, frame, -1, -ret); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_NO_MEMORY, + NULL); + return ret; + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + return ret; } - int -iot_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct statvfs *buf) +iot_priv_dump(xlator_t *this) { - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf); - return 0; -} + iot_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; - -int -iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - STACK_WIND (frame, iot_statfs_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->statfs, loc); + if (!this) return 0; -} - - -int -iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - call_stub_t *stub = NULL; - int ret = -1; - stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create statfs stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (statfs, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } + conf = this->private; + if (!conf) return 0; -} + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); -int -iot_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno); - return 0; -} + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("maximum_threads_count", "%d", conf->max_count); + gf_proc_dump_write("current_threads_count", "%d", conf->curr_count); + 
gf_proc_dump_write("sleep_count", "%d", conf->sleep_count); + gf_proc_dump_write("idle_time", "%d", conf->idle_time); + gf_proc_dump_write("stack_size", "%zd", conf->stack_size); + gf_proc_dump_write("max_high_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_HI]); + gf_proc_dump_write("max_normal_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_NORMAL]); + gf_proc_dump_write("max_low_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_LO]); + gf_proc_dump_write("max_least_priority_threads", "%d", + conf->ac_iot_limit[GF_FOP_PRI_LEAST]); + gf_proc_dump_write("current_high_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_HI]); + gf_proc_dump_write("current_normal_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_NORMAL]); + gf_proc_dump_write("current_low_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_LO]); + gf_proc_dump_write("current_least_priority_threads", "%d", + conf->ac_iot_count[GF_FOP_PRI_LEAST]); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + if (!conf->queue_sizes[i]) + continue; + snprintf(key, sizeof(key), "%s_priority_queue_length", + iot_get_pri_meaning(i)); + gf_proc_dump_write(key, "%d", conf->queue_sizes[i]); + } -int -iot_setxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags) -{ - STACK_WIND (frame, iot_setxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setxattr, loc, dict, flags); - return 0; + return 0; } - -int -iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_setxattr_stub (frame, iot_setxattr_wrapper, loc, dict, - flags); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create setxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (setxattr, frame, -1, -ret); - - if (stub != NULL) { - call_stub_destroy (stub); +/* + * We use a 
decay model to keep track and make sure we're not spawning new + * threads too often. Each increment adds a large value to a counter, and that + * counter keeps ticking back down to zero over a fairly long period. For + * example, let's use ONE_WEEK=604800 seconds, and we want to detect when we + * have N=3 increments during that time. Thus, our threshold is + * (N-1)*ONE_WEEK. To see how it works, look at three examples. + * + * (a) Two events close together, then one more almost a week later. The + * first two events push our counter to 2*ONE_WEEK plus a bit. At the third + * event, we decay down to ONE_WEEK plus a bit and then add ONE_WEEK for the + * new event, exceeding our threshold. + * + * (b) One event, then two more almost a week later. At the time of the + * second and third events, the counter is already non-zero, so when we add + * 2*ONE_WEEK we exceed again. + * + * (c) Three events, spaced three days apart. At the time of the second + * event, we decay down to approximately ONE_WEEK*4/7 and then add another + * ONE_WEEK. At the third event, we decay again down to ONE_WEEK*8/7 and add + * another ONE_WEEK, so boom. + * + * Note that in all three cases if that last event came a day later our counter + * would have decayed a bit more and we would *not* exceed our threshold. It's + * not exactly the same as a precise "three in one week" limit, but it's very + * close and it allows the same kind of tweaking while requiring only constant + * space - no arrays of variable length N to allocate or maintain. All we need + * (for each queue) is the value plus the time of the last update. + */ + +typedef struct { + time_t update_time; + uint32_t value; +} threshold_t; +/* + * Variables so that I can hack these for testing. + * TBD: make these tunable?
+ */ +static uint32_t THRESH_SECONDS = 604800; +static uint32_t THRESH_EVENTS = 3; +static uint32_t THRESH_LIMIT = 1209600; /* SECONDS * (EVENTS-1) */ + +static void +iot_apply_event(xlator_t *this, threshold_t *thresh) +{ + time_t delta, now = gf_time(); + + /* Refresh for manual testing/debugging. It's cheap. */ + THRESH_LIMIT = THRESH_SECONDS * (THRESH_EVENTS - 1); + + if (thresh->value && thresh->update_time) { + delta = now - thresh->update_time; + /* Be careful about underflow. */ + if (thresh->value <= delta) { + thresh->value = 0; + } else { + thresh->value -= delta; + } + } + + thresh->value += THRESH_SECONDS; + if (thresh->value >= THRESH_LIMIT) { + gf_log(this->name, GF_LOG_EMERG, "watchdog firing too often"); + /* + * The default action for SIGTRAP is to dump core, but the fact + * that it's distinct from other signals we use means that + * there are other possibilities as well (e.g. drop into gdb or + * invoke a special handler). + */ + kill(getpid(), SIGTRAP); + } + + thresh->update_time = now; +} + +static void * +iot_watchdog(void *arg) +{ + xlator_t *this = arg; + iot_conf_t *priv = this->private; + int i; + int bad_times[GF_FOP_PRI_MAX] = { + 0, + }; + threshold_t thresholds[GF_FOP_PRI_MAX] = {{ + 0, + }}; + + for (;;) { + sleep(max(priv->watchdog_secs / 5, 1)); + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + pthread_mutex_lock(&priv->mutex); + for (i = 0; i < GF_FOP_PRI_MAX; ++i) { + if (priv->queue_marked[i]) { + if (++bad_times[i] >= 5) { + gf_log(this->name, GF_LOG_WARNING, "queue %d stalled", i); + iot_apply_event(this, &thresholds[i]); + /* + * We might not get here if the event + * put us over our threshold. 
+ */ + ++(priv->ac_iot_limit[i]); + bad_times[i] = 0; } + } else { + bad_times[i] = 0; + } + priv->queue_marked[i] = (priv->queue_sizes[i] > 0); } - return 0; -} - + pthread_mutex_unlock(&priv->mutex); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } -int -iot_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict); - return 0; + /* NOTREACHED */ + return NULL; } - -int -iot_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) +static void +start_iot_watchdog(xlator_t *this) { - STACK_WIND (frame, iot_getxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->getxattr, loc, name); - return 0; -} - - -int -iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create getxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); - -out: - if (ret < 0) { - STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + iot_conf_t *priv = this->private; + int ret; + if (priv->watchdog_running) { + return; + } -int -iot_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict); - return 0; + ret = pthread_create(&priv->watchdog_thread, NULL, iot_watchdog, this); + if (ret == 0) { + priv->watchdog_running = _gf_true; + } else { + gf_log(this->name, GF_LOG_WARNING, + "pthread_create(iot_watchdog) failed"); + } } - -int -iot_fgetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name) +static void +stop_iot_watchdog(xlator_t *this) { - 
STACK_WIND (frame, iot_fgetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fgetxattr, fd, name); - return 0; -} + iot_conf_t *priv = this->private; + if (!priv->watchdog_running) { + return; + } -int -iot_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fgetxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (pthread_cancel(priv->watchdog_thread) != 0) { + gf_log(this->name, GF_LOG_WARNING, + "pthread_cancel(iot_watchdog) failed"); + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fgetxattr, frame, -1, -ret, NULL); + if (pthread_join(priv->watchdog_thread, NULL) != 0) { + gf_log(this->name, GF_LOG_WARNING, "pthread_join(iot_watchdog) failed"); + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + /* Failure probably means it's already dead. 
*/ + priv->watchdog_running = _gf_false; } - int -iot_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +reconfigure(xlator_t *this, dict_t *options) { - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno); - return 0; -} + iot_conf_t *conf = NULL; + int ret = -1; + conf = this->private; + if (!conf) + goto out; -int -iot_fsetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int32_t flags) -{ - STACK_WIND (frame, iot_fsetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags); - return 0; -} + GF_OPTION_RECONF("thread-count", conf->max_count, options, int32, out); + GF_OPTION_RECONF("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI], + options, int32, out); -int -iot_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fsetxattr_stub (frame, iot_fsetxattr_wrapper, fd, dict, - flags); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fsetxattr stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fsetxattr, frame, -1, -ret); + GF_OPTION_RECONF("normal-prio-threads", + conf->ac_iot_limit[GF_FOP_PRI_NORMAL], options, int32, + out); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + GF_OPTION_RECONF("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO], + options, int32, out); + GF_OPTION_RECONF("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST], + options, int32, out); -int -iot_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno); - return 0; -} + GF_OPTION_RECONF("enable-least-priority", conf->least_priority, options, + bool, out); + GF_OPTION_RECONF("cleanup-disconnected-reqs", + 
conf->cleanup_disconnected_reqs, options, bool, out); -int -iot_removexattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - STACK_WIND (frame, iot_removexattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->removexattr, loc, name); - return 0; -} + GF_OPTION_RECONF("watchdog-secs", conf->watchdog_secs, options, int32, out); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); -int -iot_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_removexattr_stub (frame, iot_removexattr_wrapper, loc, - name); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR,"cannot get removexattr fop" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (conf->watchdog_secs > 0) { + start_iot_watchdog(this); + } else { + stop_iot_watchdog(this); + } - ret = iot_schedule (frame, this, stub); + ret = 0; out: - if (ret < 0) { - STACK_UNWIND_STRICT (removexattr, frame, -1, -ret); - - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} - - -int -iot_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) -{ - STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries); - return 0; + return ret; } - int -iot_readdirp_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) +init(xlator_t *this) { - STACK_WIND (frame, iot_readdirp_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readdirp, fd, size, offset); - return 0; -} + iot_conf_t *conf = NULL; + int ret = -1; + int i = 0; + if (!this->children || this->children->next) { + gf_smsg("io-threads", GF_LOG_ERROR, 0, + IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, NULL); + goto out; + } -int -iot_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readdirp_stub (frame, 
iot_readdirp_wrapper, fd, size, - offset); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (!this->parents) { + gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_VOL_MISCONFIGURED, + NULL); + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readdirp, frame, -1, -ret, NULL); + conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_iot_mt_iot_conf_t); + if (conf == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_OUT_OF_MEMORY, + NULL); + goto out; + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + "pthread_cond_init ret=%d", ret, NULL); + goto out; + } + conf->cond_inited = _gf_true; + if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED, + "pthread_mutex_init ret=%d", ret, NULL); + goto out; + } + conf->mutex_inited = _gf_true; -int -iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) -{ - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries); - return 0; -} + ret = set_stack_size(conf); + if (ret != 0) + goto out; -int -iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - size_t size, off_t offset) -{ - STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readdir, fd, size, offset); - return 0; -} + ret = -1; + GF_OPTION_INIT("thread-count", conf->max_count, int32, out); -int -iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get 
readdir stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + GF_OPTION_INIT("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI], + int32, out); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (readdir, frame, -1, -ret, NULL); + GF_OPTION_INIT("normal-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_NORMAL], + int32, out); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + GF_OPTION_INIT("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO], int32, + out); -int -iot_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno); - return 0; -} + GF_OPTION_INIT("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST], + int32, out); + GF_OPTION_INIT("idle-time", conf->idle_time, int32, out); -int -iot_inodelk_wrapper (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - STACK_WIND (frame, iot_inodelk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->inodelk, volume, loc, cmd, lock); - return 0; -} + GF_OPTION_INIT("enable-least-priority", conf->least_priority, bool, out); + GF_OPTION_INIT("cleanup-disconnected-reqs", conf->cleanup_disconnected_reqs, + bool, out); -int -iot_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - call_stub_t *stub = NULL; - int ret = -1; + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); - stub = fop_inodelk_stub (frame, iot_inodelk_wrapper, - volume, loc, cmd, lock); - if (!stub) { - ret = -ENOMEM; - goto out; - } + conf->this = this; + GF_ATOMIC_INIT(conf->stub_cnt, 0); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (inodelk, frame, -1, -ret); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + INIT_LIST_HEAD(&conf->clients[i]); + INIT_LIST_HEAD(&conf->no_client[i].clients); + 
INIT_LIST_HEAD(&conf->no_client[i].reqs); + } - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; -} + if (!this->pass_through) { + ret = iot_workers_scale(conf); -int -iot_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno); - return 0; -} - - -int -iot_finodelk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *lock) -{ - STACK_WIND (frame, iot_finodelk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->finodelk, volume, fd, cmd, lock); - return 0; -} - - -int -iot_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_finodelk_stub (frame, iot_finodelk_wrapper, - volume, fd, cmd, lock); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get finodelk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (finodelk, frame, -1, -ret); - - if (stub != NULL) { - call_stub_destroy (stub); - } + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED, NULL); + goto out; } - return 0; -} - -int -iot_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno); - return 0; -} - - -int -iot_entrylk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - STACK_WIND (frame, iot_entrylk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->entrylk, - volume, loc, basename, cmd, type); - return 0; -} + } + this->private = conf; -int -iot_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - 
entrylk_cmd cmd, entrylk_type type) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_entrylk_stub (frame, iot_entrylk_wrapper, - volume, loc, basename, cmd, type); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get entrylk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + conf->watchdog_secs = 0; + GF_OPTION_INIT("watchdog-secs", conf->watchdog_secs, int32, out); + if (conf->watchdog_secs > 0) { + start_iot_watchdog(this); + } - ret = iot_schedule (frame, this, stub); + ret = 0; out: - if (ret < 0) { - STACK_UNWIND_STRICT (entrylk, frame, -1, -ret); + if (ret) + GF_FREE(conf); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 0; + return ret; } -int -iot_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static void +iot_exit_threads(iot_conf_t *conf) { - STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno); - return 0; + pthread_mutex_lock(&conf->mutex); + { + conf->down = _gf_true; + /*Let all the threads know that xl is going down*/ + pthread_cond_broadcast(&conf->cond); + while (conf->curr_count) /*Wait for threads to exit*/ + pthread_cond_wait(&conf->cond, &conf->mutex); + } + pthread_mutex_unlock(&conf->mutex); } - int -iot_fentrylk_wrapper (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) +notify(xlator_t *this, int32_t event, void *data, ...) 
{ - STACK_WIND (frame, iot_fentrylk_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fentrylk, - volume, fd, basename, cmd, type); - return 0; -} - - -int -iot_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fentrylk_stub (frame, iot_fentrylk_wrapper, - volume, fd, basename, cmd, type); - if (!stub) { - gf_log (this->private, GF_LOG_ERROR,"cannot get fentrylk stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fentrylk, frame, -1, -ret); + iot_conf_t *conf = this->private; + xlator_t *victim = data; + uint64_t stub_cnt = 0; + struct timespec sleep_till = { + 0, + }; - if (stub != NULL) { - call_stub_destroy (stub); + if (GF_EVENT_PARENT_DOWN == event) { + if (victim->cleanup_starting) { + /* Wait for draining stub from queue before notify PARENT_DOWN */ + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); + if (stub_cnt) { + timespec_now_realtime(&sleep_till); + sleep_till.tv_sec += 1; + pthread_mutex_lock(&conf->mutex); + { + while (stub_cnt) { + (void)pthread_cond_timedwait(&conf->cond, &conf->mutex, + &sleep_till); + stub_cnt = GF_ATOMIC_GET(conf->stub_cnt); + } } - } - return 0; -} - - -int -iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr); - return 0; -} - + pthread_mutex_unlock(&conf->mutex); + } -int -iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr); - return 0; -} - - -int -iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - call_stub_t *stub 
= NULL; - int ret = -1; - - stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype, - xattr); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name); + } else { + iot_exit_threads(conf); } + } - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (xattrop, frame, -1, -ret, NULL); - - if (stub != NULL) { - call_stub_destroy (stub); - } + if (GF_EVENT_CHILD_DOWN == event) { + if (victim->cleanup_starting) { + iot_exit_threads(conf); + gf_log(this->name, GF_LOG_INFO, + "Notify GF_EVENT_CHILD_DOWN for brick %s", victim->name); } - return 0; -} + } + default_notify(this, event, data); -int -iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, xattr); - return 0; + return 0; } -int -iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) +void +fini(xlator_t *this) { - STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr); - return 0; -} + iot_conf_t *conf = this->private; + if (!conf) + return; -int -iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype, - xattr); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } + if (conf->mutex_inited && conf->cond_inited) + iot_exit_threads(conf); - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (fxattrop, frame, -1, -ret, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - return 
0; -} + if (conf->cond_inited) + pthread_cond_destroy(&conf->cond); + if (conf->mutex_inited) + pthread_mutex_destroy(&conf->mutex); -int32_t -iot_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, - uint8_t *strong_checksum) -{ - STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, - strong_checksum); - return 0; -} + stop_iot_watchdog(this); + GF_FREE(conf); -int32_t -iot_rchecksum_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, int32_t len) -{ - STACK_WIND (frame, iot_rchecksum_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rchecksum, fd, offset, len); - return 0; + this->private = NULL; + return; } - -int32_t -iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - int32_t len) -{ - call_stub_t *stub = NULL; - int ret = -1; - - stub = fop_rchecksum_stub (frame, iot_rchecksum_wrapper, fd, offset, - len); - if (!stub) { - gf_log (this->name, GF_LOG_ERROR, "cannot create rchecksum stub" - "(out of memory)"); - ret = -ENOMEM; - goto out; - } - - ret = iot_schedule (frame, this, stub); -out: - if (ret < 0) { - STACK_UNWIND_STRICT (rchecksum, frame, -1, -ret, -1, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - } - - return 0; -} - - int -__iot_workers_scale (iot_conf_t *conf) +iot_client_destroy(xlator_t *this, client_t *client) { - int log2 = 0; - int scale = 0; - int diff = 0; - pthread_t thread; - int ret = 0; - - log2 = log_base2 (conf->queue_size); + void *tmp = NULL; - scale = log2; + if (client_ctx_del(client, this, &tmp) == 0) { + GF_FREE(tmp); + } - if (log2 < IOT_MIN_THREADS) - scale = IOT_MIN_THREADS; - - if (log2 > conf->max_count) - scale = conf->max_count; - - if (conf->curr_count < scale) { - diff = scale - conf->curr_count; - } - - while (diff) { - diff --; - - ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf); - if (ret == 0) { - conf->curr_count++; - gf_log (conf->this->name, 
GF_LOG_DEBUG, - "scaled threads to %d (queue_size=%d/%d)", - conf->curr_count, conf->queue_size, scale); - } else { - break; - } - } - - return diff; + return 0; } - -int -iot_workers_scale (iot_conf_t *conf) +static int +iot_disconnect_cbk(xlator_t *this, client_t *client) { - int ret = -1; + int i; + call_stub_t *curr; + call_stub_t *next; + iot_conf_t *conf = this->private; + iot_client_ctx_t *ctx; - if (conf == NULL) { - ret = -EINVAL; - goto out; - } + if (!conf || !conf->cleanup_disconnected_reqs) { + goto out; + } - pthread_mutex_lock (&conf->mutex); + pthread_mutex_lock(&conf->mutex); + for (i = 0; i < GF_FOP_PRI_MAX; i++) { + ctx = &conf->no_client[i]; + list_for_each_entry_safe(curr, next, &ctx->reqs, list) { - ret = __iot_workers_scale (conf); + if (curr->frame->root->client != client) { + continue; + } + gf_log(this->name, GF_LOG_INFO, + "poisoning %s fop at %p for client %s", + gf_fop_list[curr->fop], curr, client->client_uid); + curr->poison = _gf_true; } - pthread_mutex_unlock (&conf->mutex); + } + pthread_mutex_unlock(&conf->mutex); out: - return ret; -} - - -void -set_stack_size (iot_conf_t *conf) -{ - int err = 0; - size_t stacksize = IOT_THREAD_STACK_SIZE; - - pthread_attr_init (&conf->w_attr); - err = pthread_attr_setstacksize (&conf->w_attr, stacksize); - if (err == EINVAL) { - gf_log (conf->this->name, GF_LOG_WARNING, - "Using default thread stack size"); - } -} - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_iot_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - - -int -reconfigure (xlator_t *this, dict_t *options) -{ - iot_conf_t *conf = NULL; - int ret = -1; - - conf = this->private; - if (!conf) - goto out; - - GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); - - GF_OPTION_RECONF ("high-prio-threads", - conf->ac_iot_limit[IOT_PRI_HI], 
options, int32, out); - - GF_OPTION_RECONF ("normal-prio-threads", - conf->ac_iot_limit[IOT_PRI_NORMAL], options, int32, - out); - - GF_OPTION_RECONF ("low-prio-threads", - conf->ac_iot_limit[IOT_PRI_LO], options, int32, out); - - GF_OPTION_RECONF ("least-prio-threads", - conf->ac_iot_limit[IOT_PRI_LEAST], options, int32, - out); - - ret = 0; -out: - return ret; -} - - -int -init (xlator_t *this) -{ - iot_conf_t *conf = NULL; - int ret = -1; - int i = 0; - - if (!this->children || this->children->next) { - gf_log ("io-threads", GF_LOG_ERROR, - "FATAL: iot not configured with exactly one child"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - conf = (void *) GF_CALLOC (1, sizeof (*conf), - gf_iot_mt_iot_conf_t); - if (conf == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto out; - } - - if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_cond_init failed (%d)", ret); - goto out; - } - - if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "pthread_mutex_init failed (%d)", ret); - goto out; - } - - set_stack_size (conf); - - GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); - - GF_OPTION_INIT ("high-prio-threads", - conf->ac_iot_limit[IOT_PRI_HI], int32, out); - - GF_OPTION_INIT ("normal-prio-threads", - conf->ac_iot_limit[IOT_PRI_NORMAL], int32, out); - - GF_OPTION_INIT ("low-prio-threads", - conf->ac_iot_limit[IOT_PRI_LO], int32, out); - - GF_OPTION_INIT ("least-prio-threads", - conf->ac_iot_limit[IOT_PRI_LEAST], int32, out); - - GF_OPTION_INIT ("idle-time", conf->idle_time, int32, out); - - conf->this = this; - - for (i = 0; i < IOT_PRI_MAX; i++) { - INIT_LIST_HEAD (&conf->reqs[i]); - } - - ret = iot_workers_scale (conf); - - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "cannot initialize worker threads, exiting init"); - GF_FREE (conf); - goto out; 
- } - - this->private = conf; - ret = 0; -out: - return ret; -} - - -void -fini (xlator_t *this) -{ - iot_conf_t *conf = this->private; - - GF_FREE (conf); - - this->private = NULL; - return; + return 0; } +struct xlator_dumpops dumpops = { + .priv = iot_priv_dump, +}; struct xlator_fops fops = { - .open = iot_open, - .create = iot_create, - .readv = iot_readv, - .writev = iot_writev, - .flush = iot_flush, - .fsync = iot_fsync, - .lk = iot_lk, - .stat = iot_stat, - .fstat = iot_fstat, - .truncate = iot_truncate, - .ftruncate = iot_ftruncate, - .unlink = iot_unlink, - .lookup = iot_lookup, - .setattr = iot_setattr, - .fsetattr = iot_fsetattr, - .access = iot_access, - .readlink = iot_readlink, - .mknod = iot_mknod, - .mkdir = iot_mkdir, - .rmdir = iot_rmdir, - .symlink = iot_symlink, - .rename = iot_rename, - .link = iot_link, - .opendir = iot_opendir, - .fsyncdir = iot_fsyncdir, - .statfs = iot_statfs, - .setxattr = iot_setxattr, - .getxattr = iot_getxattr, - .fgetxattr = iot_fgetxattr, - .fsetxattr = iot_fsetxattr, - .removexattr = iot_removexattr, - .readdir = iot_readdir, - .readdirp = iot_readdirp, - .inodelk = iot_inodelk, - .finodelk = iot_finodelk, - .entrylk = iot_entrylk, - .fentrylk = iot_fentrylk, - .xattrop = iot_xattrop, - .fxattrop = iot_fxattrop, - .rchecksum = iot_rchecksum, + .open = iot_open, + .create = iot_create, + .readv = iot_readv, + .writev = iot_writev, + .flush = iot_flush, + .fsync = iot_fsync, + .lk = iot_lk, + .stat = iot_stat, + .fstat = iot_fstat, + .truncate = iot_truncate, + .ftruncate = iot_ftruncate, + .unlink = iot_unlink, + .lookup = iot_lookup, + .setattr = iot_setattr, + .fsetattr = iot_fsetattr, + .access = iot_access, + .readlink = iot_readlink, + .mknod = iot_mknod, + .mkdir = iot_mkdir, + .rmdir = iot_rmdir, + .symlink = iot_symlink, + .rename = iot_rename, + .link = iot_link, + .opendir = iot_opendir, + .fsyncdir = iot_fsyncdir, + .statfs = iot_statfs, + .setxattr = iot_setxattr, + .getxattr = iot_getxattr, + .fgetxattr 
= iot_fgetxattr, + .fsetxattr = iot_fsetxattr, + .removexattr = iot_removexattr, + .fremovexattr = iot_fremovexattr, + .readdir = iot_readdir, + .readdirp = iot_readdirp, + .inodelk = iot_inodelk, + .finodelk = iot_finodelk, + .entrylk = iot_entrylk, + .fentrylk = iot_fentrylk, + .xattrop = iot_xattrop, + .fxattrop = iot_fxattrop, + .rchecksum = iot_rchecksum, + .fallocate = iot_fallocate, + .discard = iot_discard, + .zerofill = iot_zerofill, + .seek = iot_seek, + .lease = iot_lease, + .getactivelk = iot_getactivelk, + .setactivelk = iot_setactivelk, + .put = iot_put, }; struct xlator_cbks cbks = { + .client_destroy = iot_client_destroy, + .client_disconnect = iot_disconnect_cbk, }; struct volume_options options[] = { - { .key = {"thread-count"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Number of threads in IO threads translator which " - "perform concurrent IO operations" - - }, - { .key = {"high-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform high priority IO operations at a given time" - - }, - { .key = {"normal-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform normal priority IO operations at a given time" - - }, - { .key = {"low-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "16", - .description = "Max number of threads in IO threads translator which " - "perform low priority IO operations at a given time" - - }, - { .key = {"least-prio-threads"}, - .type = GF_OPTION_TYPE_INT, - .min = IOT_MIN_THREADS, - .max = IOT_MAX_THREADS, - .default_value = "1", - .description = "Max number of threads in IO 
threads translator which " - "perform least priority IO operations at a given time" - }, - {.key = {"idle-time"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 0x7fffffff, - .default_value = "120", - }, - { .key = {NULL}, - }, + {.key = {"thread-count"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + /*.option = "thread-count"*/ + .description = "Number of threads in IO threads translator which " + "perform concurrent IO operations" + + }, + {.key = {"high-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform high priority IO operations at a given time" + + }, + {.key = {"normal-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform normal priority IO operations at a given time" + + }, + {.key = {"low-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "16", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads translator which " + "perform low priority IO operations at a given time" + + }, + {.key = {"least-prio-threads"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_THREADS, + .max = IOT_MAX_THREADS, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Max number of threads in IO threads 
translator which " + "perform least priority IO operations at a given time"}, + {.key = {"enable-least-priority"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = SITE_H_ENABLE_LEAST_PRIORITY, + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Enable/Disable least priority"}, + { + .key = {"idle-time"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 0x7fffffff, + .default_value = "120", + }, + {.key = {"watchdog-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = 0, + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .tags = {"io-threads"}, + .description = "Number of seconds a queue must be stalled before " + "starting an 'emergency' thread."}, + {.key = {"cleanup-disconnected-reqs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-threads"}, + .description = "'Poison' queued requests when a client disconnects"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"io-threads"}, + .description = "Enable/Disable io threads translator"}, + { + .key = {NULL}, + }, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .notify = notify, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "io-threads", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index d09fec94d8e..f54d2f4912d 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -1,85 +1,84 
@@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __IOT_H #define __IOT_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "compat-errno.h" -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" -#include "list.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/list.h> #include <stdlib.h> -#include "locking.h" +#include <glusterfs/locking.h> #include "iot-mem-types.h" #include <semaphore.h> - +#include <glusterfs/statedump.h> struct iot_conf; -#define MAX_IDLE_SKEW 4 /* In secs */ -#define skew_sec_idle_time(sec) ((sec) + (random () % MAX_IDLE_SKEW)) -#define IOT_DEFAULT_IDLE 120 /* In secs. 
*/ - -#define IOT_MIN_THREADS 1 -#define IOT_DEFAULT_THREADS 16 -#define IOT_MAX_THREADS 64 - +#define MAX_IDLE_SKEW 4 /* In secs */ +#define skew_sec_idle_time(sec) ((sec) + (random() % MAX_IDLE_SKEW)) +#define IOT_DEFAULT_IDLE 120 /* In secs. */ -#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) +#define IOT_MIN_THREADS 1 +#define IOT_DEFAULT_THREADS 16 +#define IOT_MAX_THREADS 64 +#define IOT_THREAD_STACK_SIZE ((size_t)(256 * 1024)) -typedef enum { - IOT_PRI_HI = 0, /* low latency */ - IOT_PRI_NORMAL, /* normal */ - IOT_PRI_LO, /* bulk */ - IOT_PRI_LEAST, /* least */ - IOT_PRI_MAX, -} iot_pri_t; - +typedef struct { + struct list_head clients; + struct list_head reqs; +} iot_client_ctx_t; struct iot_conf { - pthread_mutex_t mutex; - pthread_cond_t cond; - - int32_t max_count; /* configured maximum */ - int32_t curr_count; /* actual number of threads running */ - int32_t sleep_count; - - int32_t idle_time; /* in seconds */ - - struct list_head reqs[IOT_PRI_MAX]; - - int32_t ac_iot_limit[IOT_PRI_MAX]; - int32_t ac_iot_count[IOT_PRI_MAX]; - int queue_size; - pthread_attr_t w_attr; - - xlator_t *this; + pthread_mutex_t mutex; + pthread_cond_t cond; + + int32_t max_count; /* configured maximum */ + int32_t curr_count; /* actual number of threads running */ + int32_t sleep_count; + + int32_t idle_time; /* in seconds */ + + struct list_head clients[GF_FOP_PRI_MAX]; + /* + * It turns out that there are several ways a frame can get to us + * without having an associated client (server_first_lookup was the + * first one I hit). Instead of trying to update all such callers, + * we use this to queue them. 
+ */ + iot_client_ctx_t no_client[GF_FOP_PRI_MAX]; + + int32_t ac_iot_limit[GF_FOP_PRI_MAX]; + int32_t ac_iot_count[GF_FOP_PRI_MAX]; + int queue_sizes[GF_FOP_PRI_MAX]; + int32_t queue_size; + gf_atomic_t stub_cnt; + pthread_attr_t w_attr; + gf_boolean_t least_priority; /*Enable/Disable least-priority */ + + xlator_t *this; + size_t stack_size; + gf_boolean_t down; /*PARENT_DOWN event is notified*/ + gf_boolean_t mutex_inited; + gf_boolean_t cond_inited; + + int32_t watchdog_secs; + gf_boolean_t watchdog_running; + pthread_t watchdog_thread; + gf_boolean_t queue_marked[GF_FOP_PRI_MAX]; + gf_boolean_t cleanup_disconnected_reqs; }; typedef struct iot_conf iot_conf_t; diff --git a/xlators/performance/io-threads/src/iot-mem-types.h b/xlators/performance/io-threads/src/iot-mem-types.h index 14400cd083c..29565f34dd4 100644 --- a/xlators/performance/io-threads/src/iot-mem-types.h +++ b/xlators/performance/io-threads/src/iot-mem-types.h @@ -1,31 +1,21 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __IOT_MEM_TYPES_H__ #define __IOT_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_iot_mem_types_ { - gf_iot_mt_iot_conf_t = gf_common_mt_end + 1, - gf_iot_mt_end + gf_iot_mt_iot_conf_t = gf_common_mt_end + 1, + gf_iot_mt_client_ctx_t, + gf_iot_mt_end }; #endif - diff --git a/xlators/performance/stat-prefetch/Makefile.am b/xlators/performance/md-cache/Makefile.am index af437a64d6d..af437a64d6d 100644 --- a/xlators/performance/stat-prefetch/Makefile.am +++ b/xlators/performance/md-cache/Makefile.am diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am new file mode 100644 index 00000000000..447ff0f30f0 --- /dev/null +++ b/xlators/performance/md-cache/src/Makefile.am @@ -0,0 +1,29 @@ +xlator_LTLIBRARIES = md-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +md_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +md_cache_la_SOURCES = md-cache.c +md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = md-cache-mem-types.h md-cache-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(CONTRIBDIR)/rbtree + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + + +stat-prefetch-compat: + mkdir -p $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + rm -rf $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so + ln -s ./md-cache.so $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so + + +install-exec-local: stat-prefetch-compat + +uninstall-local: + rm -f 
$(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so diff --git a/xlators/performance/md-cache/src/md-cache-mem-types.h b/xlators/performance/md-cache/src/md-cache-mem-types.h new file mode 100644 index 00000000000..47a07005717 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache-mem-types.h @@ -0,0 +1,23 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __MDC_MEM_TYPES_H__ +#define __MDC_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_mdc_mem_types_ { + gf_mdc_mt_mdc_local_t = gf_common_mt_end + 1, + gf_mdc_mt_md_cache_t, + gf_mdc_mt_mdc_conf_t, + gf_mdc_mt_mdc_ipc, + gf_mdc_mt_end +}; +#endif diff --git a/xlators/performance/md-cache/src/md-cache-messages.h b/xlators/performance/md-cache/src/md-cache-messages.h new file mode 100644 index 00000000000..f367bad1991 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache-messages.h @@ -0,0 +1,29 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _MD_CACHE_MESSAGES_H_ +#define _MD_CACHE_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. 
+ * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(MD_CACHE, MD_CACHE_MSG_NO_MEMORY, MD_CACHE_MSG_DISCARD_UPDATE, + MD_CACHE_MSG_CACHE_UPDATE, MD_CACHE_MSG_IPC_UPCALL_FAILED, + MD_CACHE_MSG_NO_XATTR_CACHE); + +#endif /* _MD_CACHE_MESSAGES_H_ */ diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c new file mode 100644 index 00000000000..a405be51f02 --- /dev/null +++ b/xlators/performance/md-cache/src/md-cache.c @@ -0,0 +1,4020 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <glusterfs/glusterfs.h> +#include <glusterfs/defaults.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/syncop.h> +#include "md-cache-mem-types.h" +#include <glusterfs/compat-errno.h> +#include <glusterfs/glusterfs-acl.h> +#include <glusterfs/defaults.h> +#include <glusterfs/upcall-utils.h> +#include <assert.h> +#include <sys/time.h> +#include "md-cache-messages.h" +#include <glusterfs/statedump.h> +#include <glusterfs/atomic.h> + +/* TODO: + - cache symlink() link names and nuke symlink-cache + - send proper postbuf in setattr_cbk even when op_ret = -1 +*/ + +struct mdc_statfs_cache { + pthread_mutex_t lock; + time_t last_refreshed; /* (time_t)-1 if not yet initialized. */ + struct statvfs buf; +}; + +struct mdc_statistics { + gf_atomic_t stat_hit; /* No. of times lookup/stat was served from + mdc */ + + gf_atomic_t stat_miss; /* No. of times valid stat wasn't present in + mdc */ + + gf_atomic_t xattr_hit; /* No. 
of times getxattr was served from mdc, + Note: this doesn't count the xattr served + from lookup */ + + gf_atomic_t xattr_miss; /* No. of times xattr req was WIND from mdc */ + gf_atomic_t negative_lookup; /* No. of negative lookups */ + gf_atomic_t nameless_lookup; /* No. of negative lookups that were sent + to bricks */ + + gf_atomic_t stat_invals; /* No. of invalidates received from upcall */ + gf_atomic_t xattr_invals; /* No. of invalidates received from upcall */ + gf_atomic_t need_lookup; /* No. of lookups issued, because other + xlators requested for explicit lookup */ +}; + +struct mdc_conf { + uint32_t timeout; + gf_boolean_t cache_posix_acl; + gf_boolean_t cache_glusterfs_acl; + gf_boolean_t cache_selinux; + gf_boolean_t cache_capability; + gf_boolean_t cache_ima; + gf_boolean_t force_readdirp; + gf_boolean_t cache_swift_metadata; + gf_boolean_t cache_samba_metadata; + gf_boolean_t mdc_invalidation; + gf_boolean_t global_invalidation; + + time_t last_child_down; + gf_lock_t lock; + struct mdc_statistics mdc_counter; + gf_boolean_t cache_statfs; + struct mdc_statfs_cache statfs_cache; + char *mdc_xattr_str; + gf_atomic_int32_t generation; +}; + +struct mdc_local; +typedef struct mdc_local mdc_local_t; + +#define MDC_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + mdc_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + mdc_local_wipe(__xl, __local); \ + } while (0) + +struct md_cache { + ia_prot_t md_prot; + uint32_t md_nlink; + uint32_t md_uid; + uint32_t md_gid; + uint32_t md_atime_nsec; + uint32_t md_mtime_nsec; + uint32_t md_ctime_nsec; + int64_t md_atime; + int64_t md_mtime; + int64_t md_ctime; + uint64_t md_rdev; + uint64_t md_size; + uint64_t md_blocks; + uint64_t generation; + dict_t *xattr; + char *linkname; + time_t ia_time; + time_t xa_time; + gf_boolean_t need_lookup; + gf_boolean_t valid; + gf_boolean_t gen_rollover; + gf_boolean_t invalidation_rollover; + gf_lock_t lock; +}; + +struct mdc_local { + loc_t loc; + loc_t loc2; + fd_t *fd; + char *linkname; + char *key; + dict_t *xattr; + uint64_t incident_time; + bool update_cache; +}; + +int +__mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +{ + int ret = 0; + struct md_cache *mdc = NULL; + uint64_t mdc_int = 0; + + ret = __inode_ctx_get(inode, this, &mdc_int); + mdc = (void *)(long)(mdc_int); + if (ret == 0 && mdc_p) + *mdc_p = mdc; + + return ret; +} + +int +mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p) +{ + int ret = -1; + + if (!inode) + goto out; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_get(this, inode, mdc_p); + } + UNLOCK(&inode->lock); + +out: + return ret; +} + +uint64_t +__mdc_inc_generation(xlator_t *this, struct md_cache *mdc) +{ + uint64_t gen = 0, rollover; + struct mdc_conf *conf = NULL; + + conf = this->private; + + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + mdc->gen_rollover = !mdc->gen_rollover; + gen = GF_ATOMIC_INC(conf->generation); + mdc->ia_time = 0; + mdc->generation = 0; + } + + rollover = mdc->gen_rollover; + gen |= (rollover << 32); + return gen; +} + +uint64_t +mdc_inc_generation(xlator_t *this, 
inode_t *inode) +{ + struct mdc_conf *conf = NULL; + uint64_t gen = 0; + struct md_cache *mdc = NULL; + + conf = this->private; + + mdc_inode_ctx_get(this, inode, &mdc); + + if (mdc) { + LOCK(&mdc->lock); + { + gen = __mdc_inc_generation(this, mdc); + } + UNLOCK(&mdc->lock); + } else { + /* no cache entry for this inode: bump the global counter only, + * skipping the value 0 */ + gen = GF_ATOMIC_INC(conf->generation); + if (gen == 0) { + gen = GF_ATOMIC_INC(conf->generation); + } + } + + return gen; +} + +/* Return the generation stored in the inode's cache entry (read under + * mdc->lock), or the current xlator-wide counter when the inode has no + * entry. + */ +uint64_t +mdc_get_generation(xlator_t *this, inode_t *inode) +{ + struct mdc_conf *conf = NULL; + uint64_t gen = 0; + struct md_cache *mdc = NULL; + + conf = this->private; + + mdc_inode_ctx_get(this, inode, &mdc); + + if (mdc) { + LOCK(&mdc->lock); + { + gen = mdc->generation; + } + UNLOCK(&mdc->lock); + } else + gen = GF_ATOMIC_GET(conf->generation); + + return gen; +} + +/* Store mdc in this xlator's inode ctx slot using the unlocked + * __inode_ctx_set(); returns its result. + */ +int +__mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc) +{ + int ret = 0; + uint64_t mdc_int = 0; + + mdc_int = (long)mdc; + ret = __inode_ctx_set(inode, this, &mdc_int); + + return ret; +} + +/* Locked variant of __mdc_inode_ctx_set(): takes inode->lock around it. */ +int +mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc) +{ + int ret; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_set(this, inode, mdc); + } + UNLOCK(&inode->lock); + + return ret; +} + +/* Return frame->local if already set. Otherwise allocate a zeroed + * mdc_local_t, record the inode's current generation in incident_time, + * attach it to the frame and return it. Returns NULL if the allocation + * fails. + */ +mdc_local_t * +mdc_local_get(call_frame_t *frame, inode_t *inode) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (local) + goto out; + + local = GF_CALLOC(sizeof(*local), 1, gf_mdc_mt_mdc_local_t); + if (!local) + goto out; + + local->incident_time = mdc_get_generation(frame->this, inode); + frame->local = local; +out: + return local; +} + +/* Release everything a local owns (both locs, the fd ref, linkname, key, + * the xattr dict ref) and then the local itself. A NULL local is a no-op. + * 'this' is unused. + */ +void +mdc_local_wipe(xlator_t *this, mdc_local_t *local) +{ + if (!local) + return; + + loc_wipe(&local->loc); + + loc_wipe(&local->loc2); + + if (local->fd) + fd_unref(local->fd); + + GF_FREE(local->linkname); + + GF_FREE(local->key); + + if (local->xattr) + dict_unref(local->xattr); + + GF_FREE(local); + return; +} + +/* Remove this xlator's entry from the inode ctx and free it together with + * its xattr dict ref and linkname. Returns the inode_ctx_del() result when + * that fails, 0 otherwise. + */ +int +mdc_inode_wipe(xlator_t *this, inode_t *inode) +{ + int ret = 0; + uint64_t mdc_int = 
0; + struct md_cache *mdc = NULL; + + ret = inode_ctx_del(inode, this, &mdc_int); + if (ret != 0) + goto out; + + mdc = (void *)(long)mdc_int; + + if (mdc->xattr) + dict_unref(mdc->xattr); + + GF_FREE(mdc->linkname); + + /* the entry is detached from the inode ctx at this point; release + * the lock created by LOCK_INIT() in mdc_inode_prep() before the + * memory holding it is freed */ + LOCK_DESTROY(&mdc->lock); + + GF_FREE(mdc); + + ret = 0; +out: + return ret; +} + +/* Return the inode's cache entry, creating it if the inode has none. + * Lookup and creation both happen under inode->lock, so concurrent callers + * end up sharing a single entry. A new entry is zero-allocated, gets its + * lock initialised and is stored in the inode ctx. Returns NULL when the + * allocation or the ctx store fails (both are logged). + */ +struct md_cache * +mdc_inode_prep(xlator_t *this, inode_t *inode) +{ + int ret = 0; + struct md_cache *mdc = NULL; + + LOCK(&inode->lock); + { + ret = __mdc_inode_ctx_get(this, inode, &mdc); + if (ret == 0) + goto unlock; + + mdc = GF_CALLOC(sizeof(*mdc), 1, gf_mdc_mt_md_cache_t); + if (!mdc) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + goto unlock; + } + + LOCK_INIT(&mdc->lock); + + ret = __mdc_inode_ctx_set(this, inode, mdc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + /* never published: undo LOCK_INIT() before freeing */ + LOCK_DESTROY(&mdc->lock); + GF_FREE(mdc); + mdc = NULL; + } + } +unlock: + UNLOCK(&inode->lock); + + return mdc; +} + +/* Cache is valid if: + * - It is not cached before any brick was down. Brick down case is handled by + * invalidating all the cache when any brick went down. + * - The cache time is not expired + */ +static gf_boolean_t +__is_cache_valid(xlator_t *this, time_t mdc_time) +{ + gf_boolean_t ret = _gf_true; + struct mdc_conf *conf = NULL; + uint32_t timeout = 0; + time_t last_child_down = 0; + + conf = this->private; + + /* conf->lock here is not taken deliberately, so that the multi + * threaded IO doesn't contend on a global lock. While updating + * the variable, the lock is taken, so that at least the writes are + * intact. The read of last_child_down may return junk, but that + * is for a very short period of time. 
+ */ + last_child_down = conf->last_child_down; + timeout = conf->timeout; + + /* never cached, or cached before the last recorded child-down */ + if ((mdc_time == 0) || + ((last_child_down != 0) && (mdc_time < last_child_down))) { + ret = _gf_false; + goto out; + } + + /* older than the configured timeout */ + if (gf_time() >= (mdc_time + timeout)) { + ret = _gf_false; + } + +out: + return ret; +} + +/* Report whether the cached iatt may be served. An entry flagged invalid + * is rejected outright; otherwise ia_time goes through __is_cache_valid() + * and, if that fails, ia_time and generation are reset. Runs under + * mdc->lock. + */ +static gf_boolean_t +is_md_cache_iatt_valid(xlator_t *this, struct md_cache *mdc) +{ + gf_boolean_t ret = _gf_true; + + LOCK(&mdc->lock); + { + if (mdc->valid == _gf_false) { + ret = mdc->valid; + } else { + ret = __is_cache_valid(this, mdc->ia_time); + if (ret == _gf_false) { + mdc->ia_time = 0; + mdc->generation = 0; + } + } + } + UNLOCK(&mdc->lock); + + return ret; +} + +/* Report whether the cached xattrs may be served, based on xa_time. + * xa_time is reset when the check fails. Runs under mdc->lock. + */ +static gf_boolean_t +is_md_cache_xatt_valid(xlator_t *this, struct md_cache *mdc) +{ + gf_boolean_t ret = _gf_true; + + LOCK(&mdc->lock); + { + ret = __is_cache_valid(this, mdc->xa_time); + if (ret == _gf_false) + mdc->xa_time = 0; + } + UNLOCK(&mdc->lock); + + return ret; +} + +/* Copy the iatt fields the cache tracks (prot, nlink, uid, gid, a/m/ctime + * with nsec, rdev, size, blocks) into the entry. Takes no lock itself. + */ +void +mdc_from_iatt(struct md_cache *mdc, struct iatt *iatt) +{ + mdc->md_prot = iatt->ia_prot; + mdc->md_nlink = iatt->ia_nlink; + mdc->md_uid = iatt->ia_uid; + mdc->md_gid = iatt->ia_gid; + mdc->md_atime = iatt->ia_atime; + mdc->md_atime_nsec = iatt->ia_atime_nsec; + mdc->md_mtime = iatt->ia_mtime; + mdc->md_mtime_nsec = iatt->ia_mtime_nsec; + mdc->md_ctime = iatt->ia_ctime; + mdc->md_ctime_nsec = iatt->ia_ctime_nsec; + mdc->md_rdev = iatt->ia_rdev; + mdc->md_size = iatt->ia_size; + mdc->md_blocks = iatt->ia_blocks; +} + +/* Inverse of mdc_from_iatt(): writes the same set of fields from the + * entry into iatt and leaves every other iatt member untouched. + */ +void +mdc_to_iatt(struct md_cache *mdc, struct iatt *iatt) +{ + iatt->ia_prot = mdc->md_prot; + iatt->ia_nlink = mdc->md_nlink; + iatt->ia_uid = mdc->md_uid; + iatt->ia_gid = mdc->md_gid; + iatt->ia_atime = mdc->md_atime; + iatt->ia_atime_nsec = mdc->md_atime_nsec; + iatt->ia_mtime = mdc->md_mtime; + iatt->ia_mtime_nsec = mdc->md_mtime_nsec; + iatt->ia_ctime = mdc->md_ctime; + iatt->ia_ctime_nsec = mdc->md_ctime_nsec; + iatt->ia_rdev = mdc->md_rdev; + iatt->ia_size = mdc->md_size; + iatt->ia_blocks = 
mdc->md_blocks; +} + +/* Update (or invalidate) the cached iatt of an inode. + * + * prebuf - attributes before the operation, may be NULL + * iatt - new attributes; NULL or a zero ia_ctime invalidates + * the cached iatt instead of updating it + * update_time - when true, ia_time is refreshed on a successful update + * incident_time - generation sampled when the request started, with the + * rollover flag in bit 32 (see __mdc_inc_generation) + * + * Returns 0, or -1 when no cache entry could be prepared or when the + * update is discarded because the cached ctime is newer. + */ +int +mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf, + struct iatt *iatt, gf_boolean_t update_time, + uint64_t incident_time) +{ + int ret = 0; + struct md_cache *mdc = NULL; + uint32_t rollover = 0; + uint64_t gen = 0; + gf_boolean_t update_xa_time = _gf_false; + struct mdc_conf *conf = this->private; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) { + ret = -1; + goto out; + } + + /* split the caller's value back into rollover flag and generation */ + rollover = incident_time >> 32; + incident_time = (incident_time & 0xffffffff); + + LOCK(&mdc->lock); + { + if (!iatt || !iatt->ia_ctime) { + gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0, + "invalidating iatt(NULL)" + "(%s)", + uuid_utoa(inode->gfid)); + mdc->ia_time = 0; + mdc->valid = 0; + + gen = __mdc_inc_generation(this, mdc); + mdc->generation = (gen & 0xffffffff); + goto unlock; + } + + /* There could be a race in invalidation, where the + * invalidations in order A, B reaches md-cache in the order + * B, A. Hence, make sure the invalidation A is discarded if + * it comes after B. ctime of a file is always in ascending + * order unlike atime and mtime(which can be changed by user + * to any date), also ctime gets updates when atime/mtime + * changes, hence check for ctime only. + */ + if (mdc->md_ctime > iatt->ia_ctime) { + gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL, + MD_CACHE_MSG_DISCARD_UPDATE, + "discarding the iatt validate " + "request (%s)", + uuid_utoa(inode->gfid)); + ret = -1; + goto unlock; + } + if ((mdc->md_ctime == iatt->ia_ctime) && + (mdc->md_ctime_nsec > iatt->ia_ctime_nsec)) { + gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL, + MD_CACHE_MSG_DISCARD_UPDATE, + "discarding the iatt validate " + "request(ctime_nsec) (%s)", + uuid_utoa(inode->gfid)); + ret = -1; + goto unlock; + } + + /* + * Invalidate the inode if the mtime or ctime has changed + * and the prebuf doesn't match the value we have cached. 
+ * TODO: writev returns with a NULL iatt due to + * performance/write-behind, causing invalidation on writes. + */ + if ((iatt->ia_mtime != mdc->md_mtime) || + (iatt->ia_mtime_nsec != mdc->md_mtime_nsec) || + (iatt->ia_ctime != mdc->md_ctime) || + (iatt->ia_ctime_nsec != mdc->md_ctime_nsec)) { + if (conf->global_invalidation && + (!prebuf || (prebuf->ia_mtime != mdc->md_mtime) || + (prebuf->ia_mtime_nsec != mdc->md_mtime_nsec) || + (prebuf->ia_ctime != mdc->md_ctime) || + (prebuf->ia_ctime_nsec != mdc->md_ctime_nsec))) { + if (IA_ISREG(inode->ia_type)) { + gf_msg("md-cache", GF_LOG_TRACE, 0, + MD_CACHE_MSG_DISCARD_UPDATE, + "prebuf doesn't match the value we have cached," + " invalidate the inode(%s)", + uuid_utoa(inode->gfid)); + + inode_invalidate(inode); + } + } else { + update_xa_time = _gf_true; + } + } + + /* accept the new iatt only if the request's generation is not + * older than the one recorded in the entry and both agree on + * the rollover flag */ + if ((mdc->gen_rollover == rollover) && + (incident_time >= mdc->generation)) { + mdc_from_iatt(mdc, iatt); + mdc->valid = _gf_true; + if (update_time) { + mdc->ia_time = gf_time(); + /* xa_time is refreshed only if xattrs were already + * cached (non-zero xa_time) */ + if (mdc->xa_time && update_xa_time) + mdc->xa_time = mdc->ia_time; + } + + gf_msg_callingfn( + "md-cache", GF_LOG_TRACE, 0, MD_CACHE_MSG_CACHE_UPDATE, + "Updated iatt(%s)" + " time:%lld generation=%lld", + uuid_utoa(iatt->ia_gfid), (unsigned long long)mdc->ia_time, + (unsigned long long)mdc->generation); + } else { + gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0, + "not updating cache (%s)" + "mdc-rollover=%u rollover=%u " + "mdc-generation=%llu " + "mdc-ia_time=%llu incident_time=%llu ", + uuid_utoa(iatt->ia_gfid), mdc->gen_rollover, + rollover, (unsigned long long)mdc->generation, + (unsigned long long)mdc->ia_time, + (unsigned long long)incident_time); + } + } +unlock: + UNLOCK(&mdc->lock); + +out: + return ret; +} + +/* Shorthand for mdc_inode_iatt_set_validate() with no prebuf and with the + * cache time refreshed. + */ +int +mdc_inode_iatt_set(xlator_t *this, inode_t *inode, struct iatt *iatt, + uint64_t incident_time) +{ + return mdc_inode_iatt_set_validate(this, inode, NULL, iatt, _gf_true, + incident_time); +} + +/* Fill *iatt from the inode's cache entry. Returns 0 on a hit, or -1 when + * the inode has no entry or the cached iatt is not valid. + */ +int +mdc_inode_iatt_get(xlator_t *this, inode_t *inode, 
struct iatt *iatt) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) { + gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)", + uuid_utoa(inode->gfid)); + goto out; + } + + if (!is_md_cache_iatt_valid(this, mdc)) { + gf_msg_trace("md-cache", 0, "iatt cache not valid for (%s)", + uuid_utoa(inode->gfid)); + goto out; + } + + LOCK(&mdc->lock); + { + mdc_to_iatt(mdc, iatt); + } + UNLOCK(&mdc->lock); + + /* identity fields are not cached; they are derived from the inode. + * NOTE(review): ia_dev is a fixed constant here - confirm what + * consumers expect */ + gf_uuid_copy(iatt->ia_gfid, inode->gfid); + iatt->ia_ino = gfid_to_ino(inode->gfid); + iatt->ia_dev = 42; + iatt->ia_type = inode->ia_type; + + ret = 0; +out: + return ret; +} + +/* Accumulator handed to updatefn() through dict_foreach(): the target dict + * (created lazily) and the first error seen. + */ +struct updatedict { + dict_t *dict; + int ret; +}; + +/* Return 1 if key matches (fnmatch) any of the comma-separated patterns in + * conf->mdc_xattr_str, else 0. Also returns 0 for a NULL key, when no + * pattern string is configured, or when duplicating it fails. The string + * is duplicated because strtok_r() modifies its input. + */ +static int +is_mdc_key_satisfied(xlator_t *this, const char *key) +{ + int ret = 0; + char *pattern = NULL; + struct mdc_conf *conf = this->private; + char *mdc_xattr_str = NULL; + char *tmp = NULL; + char *tmp1 = NULL; + + if (!key) + goto out; + + /* conf->mdc_xattr_str, is never freed and is hence safely used outside + * of lock*/ + tmp1 = conf->mdc_xattr_str; + if (!tmp1) + goto out; + + mdc_xattr_str = gf_strdup(tmp1); + if (!mdc_xattr_str) + goto out; + + pattern = strtok_r(mdc_xattr_str, ",", &tmp); + while (pattern) { + gf_strTrim(&pattern); + if (fnmatch(pattern, key, 0) == 0) { + ret = 1; + break; + } else { + /* logged once per non-matching pattern, even when a + * later pattern goes on to match */ + gf_msg_trace("md-cache", 0, + "xattr key %s doesn't satisfy " + "caching requirements", + key); + } + pattern = strtok_r(NULL, ",", &tmp); + } + GF_FREE(mdc_xattr_str); +out: + return ret; +} + +/* dict_foreach() callback: copy a key/value pair into u->dict when the key + * is cacheable, creating u->dict on first use. On failure it records -1 in + * u->ret and returns -1; otherwise it returns 0. + */ +static int +updatefn(dict_t *dict, char *key, data_t *value, void *data) +{ + struct updatedict *u = data; + + if (is_mdc_key_satisfied(THIS, key)) { + if (!u->dict) { + u->dict = dict_new(); + if (!u->dict) { + u->ret = -1; + return -1; + } + } + + if (dict_set(u->dict, key, value) < 0) { + u->ret = -1; + return -1; + } + } + return 0; +} + +/* Merge the cacheable keys of src into *tgt. If *tgt is NULL a dict is + * created on demand and handed back through *tgt only when no error + * occurred. Returns 0 or -1. + */ +static int +mdc_dict_update(dict_t **tgt, dict_t *src) +{ + struct updatedict u = { + .dict = *tgt, + .ret = 0, + }; + + 
dict_foreach(src, updatefn, &u); + + /* caller supplied the dict: it was updated in place */ + if (*tgt) + return u.ret; + + /* a dict was created here but the update failed: drop it */ + if ((u.ret < 0) && u.dict) { + dict_unref(u.dict); + return u.ret; + } + + *tgt = u.dict; + + return u.ret; +} + +/* Replace the inode's cached xattrs with the cacheable keys of dict and + * stamp xa_time. The previous cached dict is dropped first, so a failed + * update leaves the entry without cached xattrs. Returns 0 on success, or + * -1 when no cache entry could be prepared, dict is NULL or the copy + * fails. + */ +int +mdc_inode_xatt_set(xlator_t *this, inode_t *inode, dict_t *dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + dict_t *newdict = NULL; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; + + if (!dict) { + gf_msg_trace("md-cache", 0, + "mdc_inode_xatt_set failed (%s) " + "dict NULL", + uuid_utoa(inode->gfid)); + goto out; + } + + LOCK(&mdc->lock); + { + if (mdc->xattr) { + gf_msg_trace("md-cache", 0, + "deleting the old xattr " + "cache (%s)", + uuid_utoa(inode->gfid)); + dict_unref(mdc->xattr); + mdc->xattr = NULL; + } + + ret = mdc_dict_update(&newdict, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; + } + + if (newdict) + mdc->xattr = newdict; + + mdc->xa_time = gf_time(); + gf_msg_trace("md-cache", 0, "xatt cache set for (%s) time:%lld", + uuid_utoa(inode->gfid), (long long)mdc->xa_time); + } + UNLOCK(&mdc->lock); + ret = 0; +out: + return ret; +} + +/* Merge the cacheable keys of dict into the inode's cached xattrs without + * dropping the existing ones. Returns 0 on success, or -1 when no cache + * entry could be prepared, dict is NULL or the merge fails. + */ +int +mdc_inode_xatt_update(xlator_t *this, inode_t *inode, dict_t *dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; + + if (!dict) + goto out; + + LOCK(&mdc->lock); + { + ret = mdc_dict_update(&mdc->xattr, dict); + if (ret < 0) { + UNLOCK(&mdc->lock); + goto out; + } + } + UNLOCK(&mdc->lock); + + ret = 0; +out: + return ret; +} + +/* Remove one key from the inode's cached xattrs. Returns 0 when a cached + * dict existed and the key was deleted from it, or -1 when no cache entry + * could be prepared, name is NULL or nothing is cached. + */ +int +mdc_inode_xatt_unset(xlator_t *this, inode_t *inode, char *name) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep(this, inode); + if (!mdc) + goto out; + + if (!name) + goto out; + + LOCK(&mdc->lock); + { + /* mdc->xattr must be examined under the lock: + * mdc_inode_xatt_set() unrefs and NULLs it while holding + * mdc->lock, so a check made before locking could hand + * dict_del() a NULL or already released dict */ + if (mdc->xattr) { + dict_del(mdc->xattr, name); + ret = 0; + } + } + UNLOCK(&mdc->lock); + +out: + return ret; +} + +/* Return the inode's cached xattrs. On success (0) *dict receives a new + * reference to the cached dict, or is left untouched when nothing is + * cached (a negative cache for the loaded keys). Returns -1 when the inode + * has no cache entry or the cached xattrs are not valid. + */ +int +mdc_inode_xatt_get(xlator_t *this, inode_t *inode, dict_t **dict) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 
0) { + gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)", + uuid_utoa(inode->gfid)); + goto out; + } + + if (!is_md_cache_xatt_valid(this, mdc)) { + gf_msg_trace("md-cache", 0, "xattr cache not valid for (%s)", + uuid_utoa(inode->gfid)); + goto out; + } + + LOCK(&mdc->lock); + { + ret = 0; + /* Missing xattr only means no keys were there, i.e + a negative cache for the "loaded" keys + */ + if (!mdc->xattr) { + gf_msg_trace("md-cache", 0, "xattr not present (%s)", + uuid_utoa(inode->gfid)); + goto unlock; + } + + if (dict) + *dict = dict_ref(mdc->xattr); + } +unlock: + UNLOCK(&mdc->lock); + +out: + return ret; +} + +/* Read and clear the entry's need_lookup flag in one locked step. Returns + * the previous value, or false when the inode has no cache entry. + */ +gf_boolean_t +mdc_inode_reset_need_lookup(xlator_t *this, inode_t *inode) +{ + struct md_cache *mdc = NULL; + gf_boolean_t need = _gf_false; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + + LOCK(&mdc->lock); + { + need = mdc->need_lookup; + mdc->need_lookup = _gf_false; + } + UNLOCK(&mdc->lock); + +out: + return need; +} + +/* Set the entry's need_lookup flag; a no-op when the inode has no cache + * entry. + */ +void +mdc_inode_set_need_lookup(xlator_t *this, inode_t *inode, gf_boolean_t need) +{ + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + + LOCK(&mdc->lock); + { + mdc->need_lookup = need; + } + UNLOCK(&mdc->lock); + +out: + return; +} + +/* Mark the cached iatt invalid and move the entry to a fresh generation + * so that replies to requests started earlier no longer update it. A + * no-op when the inode (possibly NULL) has no cache entry. + */ +void +mdc_inode_iatt_invalidate(xlator_t *this, inode_t *inode) +{ + struct md_cache *mdc = NULL; + uint32_t gen = 0; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + + /* mdc_inc_generation() takes mdc->lock itself, so it has to be + * called before the lock is taken below */ + gen = mdc_inc_generation(this, inode) & 0xffffffff; + + LOCK(&mdc->lock); + { + mdc->ia_time = 0; + mdc->valid = _gf_false; + mdc->generation = gen; + } + UNLOCK(&mdc->lock); + +out: + return; +} + +/* Expire the cached xattrs by clearing xa_time. Returns 0 when the entry + * was found and expired, or -1 when the inode has no cache entry. + */ +int +mdc_inode_xatt_invalidate(xlator_t *this, inode_t *inode) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get(this, inode, &mdc) != 0) + goto out; + + LOCK(&mdc->lock); + { + mdc->xa_time = 0; + } + UNLOCK(&mdc->lock); + + /* report success; without this the function returned -1 on every + * path and callers could not tell failure from success */ + ret = 0; +out: + return ret; +} + +/* Look the inode up by iatt->ia_gfid in the top xlator's inode table and + * refresh its cached iatt under a newly incremented generation. Returns + * -1 when the gfid is not in the table, otherwise the result of + * mdc_inode_iatt_set_validate(). + */ +static int +mdc_update_gfid_stat(xlator_t *this, struct iatt *iatt) +{ + int 
ret = 0; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, iatt->ia_gfid); + if (!inode) { + ret = -1; + goto out; + } + ret = mdc_inode_iatt_set_validate(this, inode, NULL, iatt, _gf_true, + mdc_inc_generation(this, inode)); + /* inode_find() returned the inode with a reference held for us; + * drop it now that the cache has been updated, otherwise every call + * leaks one reference */ + inode_unref(inode); +out: + return ret; +} + +/* Add every configured xattr pattern (comma-separated + * conf->mdc_xattr_str) as a key to dict so that it is requested from the + * child. Returns true when all patterns were added. Returns false when + * nothing is configured or an allocation fails; if dict_set_int8() fails, + * xattr caching is switched off by clearing conf->mdc_xattr_str. + */ +static bool +mdc_load_reqs(xlator_t *this, dict_t *dict) +{ + struct mdc_conf *conf = this->private; + char *pattern = NULL; + char *mdc_xattr_str = NULL; + char *tmp = NULL; + char *tmp1 = NULL; + int ret = 0; + bool loaded = false; + + tmp1 = conf->mdc_xattr_str; + if (!tmp1) + goto out; + + /* strtok_r() modifies its input, hence the private copy */ + mdc_xattr_str = gf_strdup(tmp1); + if (!mdc_xattr_str) + goto out; + + pattern = strtok_r(mdc_xattr_str, ",", &tmp); + while (pattern) { + gf_strTrim(&pattern); + ret = dict_set_int8(dict, pattern, 0); + if (ret) { + conf->mdc_xattr_str = NULL; + gf_msg("md-cache", GF_LOG_ERROR, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for xattrs, dict_set failed"); + goto out; + } + pattern = strtok_r(NULL, ",", &tmp); + } + + loaded = true; + +out: + GF_FREE(mdc_xattr_str); + + return loaded; +} + +/* Accumulator for checkfn(): ret stays 1 while every requested key is + * cacheable. rsp is carried along but not read by checkfn(). + */ +struct checkpair { + int ret; + dict_t *rsp; +}; + +/* dict_foreach() callback: clears pair->ret when a requested key is not + * one of the cacheable patterns. Always returns 0, so iteration continues. + */ +static int +checkfn(dict_t *this, char *key, data_t *value, void *data) +{ + struct checkpair *pair = data; + + if (!is_mdc_key_satisfied(THIS, key)) + pair->ret = 0; + + return 0; +} + +/* Return 1 when every key in req matches a configured cacheable pattern + * (so the request can be answered from cache), else 0. + */ +int +mdc_xattr_satisfied(xlator_t *this, dict_t *req, dict_t *rsp) +{ + struct checkpair pair = { + .ret = 1, + .rsp = rsp, + }; + + dict_foreach(req, checkfn, &pair); + + return pair.ret; +} + +/* Copy *buf into the xlator-wide statfs cache and stamp the refresh time, + * both under statfs_cache.lock. + */ +static void +mdc_cache_statfs(xlator_t *this, struct statvfs *buf) +{ + struct mdc_conf *conf = this->private; + + pthread_mutex_lock(&conf->statfs_cache.lock); + { + memcpy(&conf->statfs_cache.buf, buf, sizeof(struct statvfs)); + conf->statfs_cache.last_refreshed = gf_time(); + } + pthread_mutex_unlock(&conf->statfs_cache.lock); +} + +/* Point *buf at the cached statvfs if the cache has been filled and is + * not older than conf->timeout seconds. Returns 0 on a hit; returns -1 + * with *buf set to NULL on a miss, and -1 when buf or conf is NULL. + */ +int +mdc_load_statfs_info_from_cache(xlator_t *this, struct statvfs **buf) +{ + struct 
mdc_conf *conf = this->private; + uint32_t cache_age = 0; + int ret = 0; + + if (!buf || !conf) { + ret = -1; + goto err; + } + + *buf = NULL; + + pthread_mutex_lock(&conf->statfs_cache.lock); + { + /* Skip if the cache is not initialized. */ + if (conf->statfs_cache.last_refreshed == (time_t)-1) { + ret = -1; + goto unlock; + } + + cache_age = (gf_time() - conf->statfs_cache.last_refreshed); + + gf_log(this->name, GF_LOG_DEBUG, "STATFS cache age = %u secs", + cache_age); + if (cache_age > conf->timeout) { + /* Expire the cache. */ + gf_log(this->name, GF_LOG_DEBUG, + "Cache age %u secs exceeded timeout %u secs", cache_age, + conf->timeout); + ret = -1; + goto unlock; + } + + /* NOTE(review): the pointer handed out refers to the shared + * cache buffer and is used by the caller after the mutex is + * released - confirm this is acceptable */ + *buf = &conf->statfs_cache.buf; + } +unlock: + pthread_mutex_unlock(&conf->statfs_cache.lock); +err: + return ret; +} + +/* Build the xdata to wind with. The returned dict (if any) carries a + * reference the caller must drop: either an extra ref on the caller's + * xdata or a newly created dict. When local is non-NULL the configured + * xattr keys are added to the dict and local->update_cache records + * whether that succeeded. + */ +static dict_t * +mdc_prepare_request(xlator_t *this, mdc_local_t *local, dict_t *xdata) +{ + if (xdata != NULL) { + dict_ref(xdata); + } + + if (local == NULL) { + return xdata; + } + + if (xdata == NULL) { + xdata = dict_new(); + if (xdata == NULL) { + local->update_cache = false; + + return NULL; + } + } + + local->update_cache = mdc_load_reqs(this, xdata); + + return xdata; +} + +/* statfs callback: on ENOENT/ESTALE invalidate the cached iatt of the + * inode the call was made on; on success store the result in the statfs + * cache when conf->cache_statfs is set. Always unwinds with the child's + * result. + */ +int +mdc_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + struct mdc_conf *conf = this->private; + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + } + + goto out; + } + + if (conf && conf->cache_statfs) { + mdc_cache_statfs(this, buf); + } + +out: + MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + +/* statfs fop: answer from the statfs cache when caching is enabled and + * the cache is fresh, otherwise wind to the child. Unwinds with ENOMEM + * when the local cannot be allocated. + */ +int +mdc_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = 0, op_ret = 0, op_errno = 0; + struct statvfs *buf = NULL; + mdc_local_t *local = NULL; + struct mdc_conf *conf = 
this->private; + + local = mdc_local_get(frame, loc->inode); + if (!local) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + loc_copy(&local->loc, loc); + + if (!conf) { + goto uncached; + } + + if (!conf->cache_statfs) { + goto uncached; + } + + ret = mdc_load_statfs_info_from_cache(this, &buf); + if (ret == 0 && buf) { + /* cache hit: unwind straight away with the cached buffer */ + op_ret = 0; + op_errno = 0; + goto out; + } + +uncached: + STACK_WIND(frame, mdc_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; + +out: + MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +/* lookup callback. On failure: count negative lookups (ENOENT) and, for + * ESTALE, invalidate the cached iatt of the parent (or of the inode when + * there is no parent). On success: refresh the cached iatt of parent and + * inode and, when the request carried the xattr keys + * (local->update_cache), replace the cached xattrs with the reply dict. + * Always unwinds with the child's result. + */ +int +mdc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *dict, struct iatt *postparent) +{ + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if (op_errno == ENOENT) + GF_ATOMIC_INC(conf->mdc_counter.negative_lookup); + + if (op_errno == ESTALE) { + /* if op_errno is ENOENT, fuse-bridge will unlink the + * dentry + */ + if (local->loc.parent) + mdc_inode_iatt_invalidate(this, local->loc.parent); + else + mdc_inode_iatt_invalidate(this, local->loc.inode); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, stbuf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, dict); + } + } +out: + MDC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, dict, + postparent); + return 0; +} + +/* lookup fop: serve iatt (and xattrs, when xdata is given and all of its + * keys are cacheable) from the cache when the inode is linked, not + * flagged need_lookup and the cached data is valid; otherwise wind to the + * child with the configured xattr keys added to the request. + */ +int +mdc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = 0; + struct iatt stbuf = { + 0, + }; + struct iatt postparent = { + 0, + }; + dict_t *xattr_rsp = NULL; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, 
loc->inode); + if (!local) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + loc_copy(&local->loc, loc); + + if (!inode_is_linked(loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + if (mdc_inode_reset_need_lookup(this, loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.need_lookup); + goto uncached; + } + + ret = mdc_inode_iatt_get(this, loc->inode, &stbuf); + if (ret != 0) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + if (xdata) { + ret = mdc_inode_xatt_get(this, loc->inode, &xattr_rsp); + if (ret != 0) { + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + goto uncached; + } + + if (!mdc_xattr_satisfied(this, xdata, xattr_rsp)) { + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + goto uncached; + } + } + + /* cache hit: postparent is returned zero-filled */ + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &stbuf, xattr_rsp, + &postparent); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + return 0; + +uncached: + /* from here on xdata carries a reference owned by this function */ + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} + +/* stat callback: on ESTALE/ENOENT invalidate the cached iatt; on success + * refresh the cached iatt and, when the xattr keys were requested, the + * cached xattrs. Always unwinds with the child's result. + */ +int +mdc_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + } + + goto out; + } + + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, xdata); + } + +out: + MDC_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + +/* stat fop: serve the iatt from the cache when the inode is linked and + * its cached iatt is valid, otherwise wind to the child. + */ +int +mdc_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret; + struct iatt 
stbuf; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, loc->inode); + if (!local) + goto uncached; + + loc_copy(&local->loc, loc); + + if (!inode_is_linked(loc->inode)) { + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + goto uncached; + } + + ret = mdc_inode_iatt_get(this, loc->inode, &stbuf); + if (ret != 0) + goto uncached; + + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(stat, frame, 0, 0, &stbuf, xdata); + + return 0; + +uncached: + xdata = mdc_prepare_request(this, local, xdata); + + GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + STACK_WIND(frame, mdc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} + +int +mdc_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->fd->inode); + } + + goto out; + } + + mdc_inode_iatt_set(this, local->fd->inode, buf, local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, local->fd->inode, xdata); + } + +out: + MDC_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); + + return 0; +} + +int +mdc_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret; + struct iatt stbuf; + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto uncached; + + local->fd = __fd_ref(fd); + + ret = mdc_inode_iatt_get(this, fd->inode, &stbuf); + if (ret != 0) + goto uncached; + + GF_ATOMIC_INC(conf->mdc_counter.stat_hit); + MDC_STACK_UNWIND(fstat, frame, 0, 0, &stbuf, xdata); + + return 0; + +uncached: + xdata = mdc_prepare_request(this, local, xdata); + + 
GF_ATOMIC_INC(conf->mdc_counter.stat_miss); + STACK_WIND(frame, mdc_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + + /* drop the reference obtained from mdc_prepare_request() */ + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +} + +/* truncate callback: on ESTALE/ENOENT invalidate the cached iatt; on + * success update it from postbuf, passing prebuf so that + * mdc_inode_iatt_set_validate() can compare it against the cached values. + * Always unwinds with the child's result. + */ +int +mdc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + + goto out; + } + + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +/* truncate fop: remember the inode (with a reference that loc_wipe() in + * mdc_local_wipe() releases) for the callback and wind to the child. The + * call is wound even when the local cannot be allocated. + */ +int +mdc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + local->loc.inode = inode_ref(loc->inode); + } + + STACK_WIND(frame, mdc_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/* ftruncate callback: same handling as mdc_truncate_cbk, applied to the + * inode of the fd saved in the local. + */ +int +mdc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + + goto out; + } + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +/* ftruncate fop: keep a reference to the fd for the callback and wind to + * the child. + */ +int +mdc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, 
fd->inode); + if (local != NULL) { + /* reference released by mdc_local_wipe() */ + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +/* mknod callback: on ESTALE/ENOENT invalidate the parent's cached iatt; + * on success refresh the cached iatt of the parent (from postparent) and + * of the new inode (from buf). Always unwinds with the child's result. + */ +int +mdc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } +out: + MDC_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +/* mknod fop: save a copy of loc and a reference to xdata in the local for + * the callback, then wind to the child. The call is wound even when the + * local cannot be allocated. + */ +int +mdc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } + + STACK_WIND(frame, mdc_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +} + +/* mkdir callback: same handling as mdc_mknod_cbk. */ +int +mdc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if 
(local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } +out: + MDC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +/* mkdir fop: save a copy of loc and a reference to xdata in the local for + * the callback, then wind to the child. The call is wound even when the + * local cannot be allocated. + */ +int +mdc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } + + STACK_WIND(frame, mdc_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + return 0; +} + +/* unlink callback: on ENOENT/ESTALE invalidate the cached iatt of both + * the entry and its parent; on success refresh the parent from postparent + * and invalidate the entry's cached iatt (a NULL iatt is passed to + * mdc_inode_iatt_set()). Always unwinds with the child's result. + */ +int +mdc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + /* if errno is ESTALE, parent is not present, which implies even + * child is not present. Also, man 2 unlink states unlink can + * return ENOENT if a component in pathname does not + * exist or is a dangling symbolic link. 
So, invalidate both + * parent and child for both errno + */ + + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + /* a NULL iatt makes mdc_inode_iatt_set() invalidate the + * cached iatt of the unlinked inode */ + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + } + +out: + MDC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +/* unlink fop: save a copy of loc in the local for the callback and wind + * to the child. The call is wound even when the local cannot be + * allocated. + */ +int +mdc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + } + + STACK_WIND(frame, mdc_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; +} + +/* rmdir callback: on ESTALE/ENOENT invalidate the cached iatt of both the + * directory and its parent; on success refresh the parent from + * postparent. Always unwinds with the child's result. + */ +int +mdc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + /* if errno is ESTALE, parent is not present, which implies even + * child is not present. Also, man 2 rmdir states rmdir can + * return ENOENT if a directory component in pathname does not + * exist or is a dangling symbolic link. 
So, invalidate both + * parent and child for both errno + */ + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + +out: + MDC_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; +} + +int +mdc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + } + + STACK_WIND(frame, mdc_rmdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, loc, flag, xdata); + return 0; +} + +int +mdc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } +out: + MDC_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +int +mdc_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + mdc_local_t *local = NULL; + char *name; + + name = gf_strdup(linkname); + if (name == NULL) { + goto wind; + } + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + GF_FREE(name); + goto wind; + } + + loc_copy(&local->loc, loc); + 
local->linkname = name; + +wind: + STACK_WIND(frame, mdc_symlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata); + return 0; +} + +int +mdc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc2.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postoldparent, + local->incident_time); + } + + if (local->loc.inode) { + /* TODO: fix dht_rename() not to return linkfile + attributes before setting attributes here + */ + + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + } + + if (local->loc2.parent) { + mdc_inode_iatt_set(this, local->loc2.parent, postnewparent, + local->incident_time); + } +out: + MDC_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); + return 0; +} + +int +mdc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, oldloc->inode); + if (local != NULL) { + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + } + + STACK_WIND(frame, mdc_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; +} + +int +mdc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if 
(!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) { + mdc_inode_iatt_invalidate(this, local->loc.inode); + mdc_inode_iatt_invalidate(this, local->loc2.parent); + } + + goto out; + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time); + } + + if (local->loc2.parent) { + mdc_inode_iatt_set(this, local->loc2.parent, postparent, + local->incident_time); + } +out: + MDC_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +} + +int +mdc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, oldloc->inode); + if (local != NULL) { + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + } + + STACK_WIND(frame, mdc_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int +mdc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) { + mdc_inode_iatt_invalidate(this, local->loc.parent); + } + + goto out; + } + + if (local->loc.parent) { + mdc_inode_iatt_set(this, local->loc.parent, postparent, + local->incident_time); + } + + if (local->loc.inode) { + mdc_inode_iatt_set(this, inode, buf, local->incident_time); + } +out: + MDC_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf, preparent, + postparent, xdata); + return 0; +} + +int +mdc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + 
loc_copy(&local->loc, loc); + local->xattr = dict_ref(xdata); + } + + STACK_WIND(frame, mdc_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +} + +static int +mdc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } + + if (local->fd->flags & O_TRUNC) { + /* O_TRUNC modifies file size. Hence invalidate the + * cache entry to fetch latest attributes. */ + mdc_inode_iatt_invalidate(this, local->fd->inode); + } + +out: + MDC_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +static int +mdc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + if (!fd || !IA_ISREG(fd->inode->ia_type) || !(fd->flags & O_TRUNC)) { + goto out; + } + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + +out: + STACK_WIND(frame, mdc_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int +mdc_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret < 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set(this, local->fd->inode, stbuf, local->incident_time); + +out: + MDC_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); + + return 0; +} + +int +mdc_readv(call_frame_t *frame, 
xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} + +int +mdc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == -1) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +mdc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} + +int +mdc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + mdc_inode_iatt_set(this, local->loc.inode, NULL, local->incident_time); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf, postbuf, + _gf_true, local->incident_time); + mdc_inode_xatt_update(this, local->loc.inode, xdata); + +out: + MDC_STACK_UNWIND(setattr, 
frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +mdc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int valid, dict_t *xdata) +{ + mdc_local_t *local = NULL; + dict_t *xattr_alloc = NULL; + int ret = 0; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + goto wind; + } + + loc_copy(&local->loc, loc); + + if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0); + if (!ret) + ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->loc.inode); + } + } + + if ((valid & GF_SET_ATTR_MODE) && conf->cache_posix_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0); + if (!ret) + ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->loc.inode); + } + } + +wind: + STACK_WIND(frame, mdc_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + + if (xattr_alloc) + dict_unref(xattr_alloc); + return 0; +} + +int +mdc_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + mdc_inode_xatt_update(this, local->fd->inode, xdata); + +out: + MDC_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +mdc_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, 
struct iatt *stbuf, + int valid, dict_t *xdata) +{ + mdc_local_t *local = NULL; + dict_t *xattr_alloc = NULL; + int ret = 0; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, fd->inode); + if (local == NULL) { + goto wind; + } + + local->fd = __fd_ref(fd); + + if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0); + if (!ret) + ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->fd->inode); + } + } + + if ((valid & GF_SET_ATTR_MODE) && conf->cache_posix_acl) { + if (!xdata) + xdata = xattr_alloc = dict_new(); + if (xdata) { + ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0); + if (!ret) + ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + mdc_inode_xatt_invalidate(this, local->fd->inode); + } + } + +wind: + STACK_WIND(frame, mdc_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + + if (xattr_alloc) + dict_unref(xattr_alloc); + return 0; +} +

/*
 * fsync callback.
 *
 * Failure with ENOENT/ESTALE invalidates the cached iatt of the fd's inode;
 * success updates it from @prebuf/@postbuf.
 */
int
mdc_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
              dict_t *xdata)
{
    mdc_local_t *local = frame->local;

    if (local != NULL) {
        if (op_ret != 0) {
            if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
                mdc_inode_iatt_invalidate(this, local->fd->inode);
            }
        } else {
            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf,
                                        _gf_true, local->incident_time);
        }
    }

    MDC_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);

    return 0;
}

+int +mdc_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_fsync_cbk,
FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; +} + +int +mdc_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } + + mdc_inode_xatt_update(this, local->loc.inode, local->xattr); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->loc.inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +mdc_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (local != NULL) { + loc_copy(&local->loc, loc); + local->xattr = dict_ref(xattr); + } + + STACK_WIND(frame, mdc_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata); + + return 0; +} + +int +mdc_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_xatt_update(this, local->fd->inode, local->xattr); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + 
ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->fd->inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->fd->inode); + +out: + MDC_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +mdc_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + local->xattr = dict_ref(xattr); + } + + STACK_WIND(frame, mdc_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, xattr, flags, xdata); + + return 0; +} + +int +mdc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret < 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + goto out; + } + + if (dict_get(xattr, "glusterfs.skip-cache")) { + gf_msg(this->name, GF_LOG_DEBUG, 0, 0, + "Skipping xattr update due to empty value"); + goto out; + } + + if (local->update_cache) { + mdc_inode_xatt_set(this, local->loc.inode, xdata); + } + +out: + MDC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); + + return 0; +} + +int +mdc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +{ + int ret; + int op_errno = ENODATA; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + gf_boolean_t key_satisfied = _gf_false; + + local = mdc_local_get(frame, loc->inode); + if (!local) { + goto uncached; + } + + loc_copy(&local->loc, loc); + + if (!is_mdc_key_satisfied(this, key)) { + goto uncached; + } + key_satisfied = _gf_true; + + ret = mdc_inode_xatt_get(this, loc->inode, &xattr); + if 
(ret != 0) + goto uncached; + + if (!xattr || !dict_get(xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + MDC_STACK_UNWIND(getxattr, frame, ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + if (key_satisfied) { + xdata = mdc_prepare_request(this, local, xdata); + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + + if (key_satisfied && (xdata != NULL)) { + dict_unref(xdata); + } + + return 0; +} +

/*
 * fgetxattr callback.
 *
 * - negative @op_ret with ENOENT/ESTALE: invalidate the cached iatt of the
 *   fd's inode
 * - reply contains "glusterfs.skip-cache": log and leave the cache alone
 * - otherwise, if local->update_cache is set, store the returned @xdata as
 *   the inode's cached xattrs
 */
int
mdc_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, dict_t *xattr,
                  dict_t *xdata)
{
    mdc_local_t *local = frame->local;

    if (local != NULL) {
        if (op_ret < 0) {
            if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
                mdc_inode_iatt_invalidate(this, local->fd->inode);
            }
        } else if (dict_get(xattr, "glusterfs.skip-cache")) {
            gf_msg(this->name, GF_LOG_DEBUG, 0, 0,
                   "Skipping xattr update due to empty value");
        } else if (local->update_cache) {
            mdc_inode_xatt_set(this, local->fd->inode, xdata);
        }
    }

    MDC_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);

    return 0;
}

+int +mdc_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + int ret; + mdc_local_t *local = NULL; + dict_t *xattr = NULL; + int op_errno = ENODATA; + struct mdc_conf *conf = this->private; + gf_boolean_t key_satisfied = _gf_true; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto uncached; + + local->fd = __fd_ref(fd); + + if (!is_mdc_key_satisfied(this, key)) { + key_satisfied = _gf_false; + goto uncached; + } + + ret = mdc_inode_xatt_get(this, fd->inode, &xattr); + if (ret != 0) + goto uncached; + + if (!xattr || !dict_get(xattr, (char *)key)) { + ret = -1; + op_errno = ENODATA; + } + +
GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + MDC_STACK_UNWIND(fgetxattr, frame, ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + if (key_satisfied) { + xdata = mdc_prepare_request(this, local, xdata); + } + + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata); + + if (key_satisfied && (xdata != NULL)) { + dict_unref(xdata); + } + + return 0; +} +

/*
 * removexattr callback.
 *
 * Failure with ENOENT/ESTALE invalidates the inode's cached iatt.  On
 * success the removed key (local->key) is dropped from the cached xattrs,
 * or the whole xattr cache is invalidated when no key was recorded.  The
 * iatt cache is then updated from GF_PRESTAT/GF_POSTSTAT in @xdata.  If
 * either stat lookup fails the cached iatt is invalidated.
 */
int
mdc_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
    struct iatt prestat = {
        0,
    };
    struct iatt poststat = {
        0,
    };
    mdc_local_t *local = frame->local;
    int ret = 0;

    if (local == NULL) {
        goto out;
    }

    if (op_ret != 0) {
        if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
            mdc_inode_iatt_invalidate(this, local->loc.inode);
        }
        goto out;
    }

    if (local->key != NULL) {
        mdc_inode_xatt_unset(this, local->loc.inode, local->key);
    } else {
        mdc_inode_xatt_invalidate(this, local->loc.inode);
    }

    ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat);
    if (ret >= 0) {
        /* ret now reflects the post-stat lookup; checked below */
        ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat);
        mdc_inode_iatt_set_validate(this, local->loc.inode, &prestat, &poststat,
                                    _gf_true, local->incident_time);
    }

    if (ret < 0) {
        mdc_inode_iatt_invalidate(this, local->loc.inode);
    }

out:
    MDC_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);

    return 0;
}

+int +mdc_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + int op_errno = ENODATA; + int ret = 0; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + char *name2; + + name2 = gf_strdup(name); + if (name2 == NULL) { + goto uncached; + } + + local = mdc_local_get(frame, loc->inode); + if (local == NULL) { + GF_FREE(name2); + goto uncached; + } + + loc_copy(&local->loc, loc); + local->key = name2; + + if
(!is_mdc_key_satisfied(this, name)) + goto uncached; + + ret = mdc_inode_xatt_get(this, loc->inode, &xattr); + if (ret != 0) + goto uncached; + + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + + if (!xattr || !dict_get(xattr, (char *)name)) { + ret = -1; + op_errno = ENODATA; + + MDC_STACK_UNWIND(removexattr, frame, ret, op_errno, xdata); + } else { + STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + } + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +} + +int +mdc_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct iatt prestat = { + 0, + }; + struct iatt poststat = { + 0, + }; + int ret = 0; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + if (local->key) + mdc_inode_xatt_unset(this, local->fd->inode, local->key); + else + mdc_inode_xatt_invalidate(this, local->fd->inode); + + ret = dict_get_iatt(xdata, GF_PRESTAT, &prestat); + if (ret >= 0) { + ret = dict_get_iatt(xdata, GF_POSTSTAT, &poststat); + mdc_inode_iatt_set_validate(this, local->fd->inode, &prestat, &poststat, + _gf_true, local->incident_time); + } + + if (ret < 0) + mdc_inode_iatt_invalidate(this, local->fd->inode); + +out: + MDC_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int +mdc_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + int op_errno = ENODATA; + int ret = 0; + dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; + char *name2; + + name2 = gf_strdup(name); + if 
(name2 == NULL) { + goto uncached; + } + + local = mdc_local_get(frame, fd->inode); + if (local == NULL) { + GF_FREE(name2); + goto uncached; + } + + local->fd = __fd_ref(fd); + local->key = name2; + + if (!is_mdc_key_satisfied(this, name)) + goto uncached; + + ret = mdc_inode_xatt_get(this, fd->inode, &xattr); + if (ret != 0) + goto uncached; + + GF_ATOMIC_INC(conf->mdc_counter.xattr_hit); + + if (!xattr || !dict_get(xattr, (char *)name)) { + ret = -1; + op_errno = ENODATA; + + MDC_STACK_UNWIND(fremovexattr, frame, ret, op_errno, xdata); + } else { + STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + } + + if (xattr) + dict_unref(xattr); + + return 0; + +uncached: + GF_ATOMIC_INC(conf->mdc_counter.xattr_miss); + STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + return 0; +} +

/*
 * opendir callback.
 *
 * Only a failed opendir with ESTALE/ENOENT touches the cache: the
 * directory inode's cached iatt is invalidated.  Everything else is passed
 * through unchanged.
 */
int32_t
mdc_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
    mdc_local_t *local = frame->local;

    if ((local != NULL) && (op_ret != 0) &&
        ((op_errno == ESTALE) || (op_errno == ENOENT))) {
        mdc_inode_iatt_invalidate(this, local->loc.inode);
    }

    MDC_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
    return 0;
}

/*
 * opendir fop entry point.
 *
 * Records @loc in the frame-local context (when available), passes @xdata
 * through mdc_prepare_request() and forwards the call.  The dict returned
 * by mdc_prepare_request() is released after the wind.
 */
int
mdc_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
            dict_t *xdata)
{
    mdc_local_t *local = mdc_local_get(frame, loc->inode);

    if (local) {
        loc_copy(&local->loc, loc);
    }

    /* Tell readdir-ahead to include these keys in xdata when it
     * internally issues readdirp() in its opendir_cbk */
    xdata = mdc_prepare_request(this, local, xdata);

    STACK_WIND(frame, mdc_opendir_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);

    if (xdata) {
        dict_unref(xdata);
    }

    return 0;
}

+int +mdc_readdirp_cbk(call_frame_t
*frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto unwind; + + if (op_ret <= 0) { + if ((op_ret == -1) && ((op_errno == ENOENT) || (op_errno == ESTALE))) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto unwind; + } + + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode) + continue; + mdc_inode_iatt_set(this, entry->inode, &entry->d_stat, + local->incident_time); + if (local->update_cache) { + mdc_inode_xatt_set(this, entry->inode, entry->dict); + } + } + +unwind: + MDC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +int +mdc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto out; + + local->fd = __fd_ref(fd); + + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +out: + MDC_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +int +mdc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); +out: + MDC_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +int +mdc_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + mdc_local_t *local = NULL; + struct mdc_conf *conf = this->private; + + local = mdc_local_get(frame, fd->inode); + 
if (!local) + goto unwind; + + local->fd = __fd_ref(fd); + + if (!conf->force_readdirp) { + STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + return 0; + } + + xdata = mdc_prepare_request(this, local, xdata); + + STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + + if (xdata != NULL) { + dict_unref(xdata); + } + + return 0; +unwind: + MDC_STACK_UNWIND(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; +} +

/*
 * fallocate callback.
 *
 * Failure with ENOENT/ESTALE invalidates the cached iatt of the fd's inode;
 * success updates it from @prebuf/@postbuf.
 */
int
mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
                  struct iatt *postbuf, dict_t *xdata)
{
    mdc_local_t *local = frame->local;

    if (local != NULL) {
        if (op_ret != 0) {
            if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
                mdc_inode_iatt_invalidate(this, local->fd->inode);
            }
        } else {
            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf,
                                        _gf_true, local->incident_time);
        }
    }

    MDC_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
                     xdata);

    return 0;
}

/*
 * fallocate fop entry point.
 *
 * Keeps a reference on @fd in the frame-local context (when available) and
 * forwards the request to the first child.
 */
int
mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
              off_t offset, size_t len, dict_t *xdata)
{
    mdc_local_t *local = mdc_local_get(frame, fd->inode);

    if (local) {
        local->fd = __fd_ref(fd);
    }

    STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this),
               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
               xdata);

    return 0;
}

+int +mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set_validate(this,
local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + + return 0; +} + +int +mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret != 0) { + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + goto out; + } + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf, + _gf_true, local->incident_time); + +out: + MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; +} + +int +mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame, fd->inode); + if (local != NULL) { + local->fd = __fd_ref(fd); + } + + STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + + return 0; +} + +int32_t +mdc_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *path, + struct iatt *buf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ENOENT) || (op_errno == ESTALE)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, buf, 
xdata); + return 0; +} + +int32_t +mdc_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if (!local) + goto unwind; + + loc_copy(&local->loc, loc); + + STACK_WIND(frame, mdc_readlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, loc, size, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int32_t +mdc_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->fd->inode); + +out: + MDC_STACK_UNWIND(fsyncdir, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +mdc_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, fd->inode); + if (!local) + goto unwind; + + local->fd = __fd_ref(fd); + + STACK_WIND(frame, mdc_fsyncdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(fsyncdir, frame, -1, ENOMEM, NULL); + return 0; +} + +int32_t +mdc_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + if (!local) + goto out; + + if (op_ret == 0) + goto out; + + if ((op_errno == ESTALE) || (op_errno == ENOENT)) + mdc_inode_iatt_invalidate(this, local->loc.inode); + +out: + MDC_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +mdc_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get(frame, loc->inode); + if 
(!local) + goto unwind; + + loc_copy(&local->loc, loc); + + STACK_WIND(frame, mdc_access_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, loc, mask, xdata); + return 0; + +unwind: + MDC_STACK_UNWIND(access, frame, -1, ENOMEM, NULL); + return 0; +} + +int +mdc_priv_dump(xlator_t *this) +{ + struct mdc_conf *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("stat_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_hit)); + gf_proc_dump_write("stat_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_miss)); + gf_proc_dump_write("xattr_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_hit)); + gf_proc_dump_write("xattr_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_miss)); + gf_proc_dump_write("nameless_lookup_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup)); + gf_proc_dump_write("negative_lookup_count", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.negative_lookup)); + gf_proc_dump_write("stat_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.stat_invals)); + gf_proc_dump_write("xattr_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->mdc_counter.xattr_invals)); + + return 0; +} + +static int32_t +mdc_dump_metrics(xlator_t *this, int fd) +{ + struct mdc_conf *conf = NULL; + + conf = this->private; + if (!conf) + goto out; + + dprintf(fd, "%s.stat_cache_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.stat_hit)); + dprintf(fd, "%s.stat_cache_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.stat_miss)); + dprintf(fd, "%s.xattr_cache_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.xattr_hit)); + dprintf(fd, "%s.xattr_cache_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.xattr_miss)); + 
dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup)); + dprintf(fd, "%s.negative_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->mdc_counter.negative_lookup)); + dprintf(fd, "%s.stat_cache_invalidations_received %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->mdc_counter.stat_invals)); + dprintf(fd, "%s.xattr_cache_invalidations_received %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->mdc_counter.xattr_invals)); +out: + return 0; +} + +int +mdc_forget(xlator_t *this, inode_t *inode) +{ + mdc_inode_wipe(this, inode); + + return 0; +} + +int +is_strpfx(const char *str1, const char *str2) +{ + /* is one of the string a prefix of the other? */ + int i; + + for (i = 0; str1[i] == str2[i]; i++) { + if (!str1[i] || !str2[i]) + break; + } + + return !(str1[i] && str2[i]); +} + +static int +mdc_key_unload_all(struct mdc_conf *conf) +{ + conf->mdc_xattr_str = NULL; + + return 0; +} + +int +mdc_xattr_list_populate(struct mdc_conf *conf, char *tmp_str) +{ + char *mdc_xattr_str = NULL; + size_t max_size = 0; + int ret = 0; + + max_size = SLEN( + "security.capability,security.selinux,security." 
+ "ima," POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR + "," GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT + "," + "user.swift.metadata,user.DOSATTRIB,user.DosStream.*" + ",user.org.netatalk.Metadata,security.NTACL," + "user.org.netatalk.ResourceFork") + + strlen(tmp_str) + 5; /*Some buffer bytes*/ + + mdc_xattr_str = GF_MALLOC(max_size, gf_common_mt_char); + GF_CHECK_ALLOC(mdc_xattr_str, ret, out); + mdc_xattr_str[0] = '\0'; + + if (conf->cache_capability) + strcat(mdc_xattr_str, "security.capability,"); + + if (conf->cache_selinux) + strcat(mdc_xattr_str, "security.selinux,"); + + if (conf->cache_ima) + strcat(mdc_xattr_str, "security.ima,"); + + if (conf->cache_posix_acl) + strcat(mdc_xattr_str, + POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR ","); + + if (conf->cache_glusterfs_acl) + strcat(mdc_xattr_str, GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT ","); + + if (conf->cache_swift_metadata) + strcat(mdc_xattr_str, "user.swift.metadata,"); + + if (conf->cache_samba_metadata) + strcat(mdc_xattr_str, + "user.DOSATTRIB,user.DosStream.*," + "user.org.netatalk.Metadata,user.org.netatalk." 
+ "ResourceFork,security.NTACL,"); + + strcat(mdc_xattr_str, tmp_str); + + LOCK(&conf->lock); + { + /* This is not freed, else is_mdc_key_satisfied, which is + * called by every fop has to take lock, and will lead to + * lock contention + */ + conf->mdc_xattr_str = mdc_xattr_str; + } + UNLOCK(&conf->lock); + +out: + return ret; +} + +struct set { + inode_t *inode; + xlator_t *this; +}; + +static int +mdc_inval_xatt(dict_t *d, char *k, data_t *v, void *tmp) +{ + struct set *tmp1 = NULL; + int ret = 0; + + tmp1 = (struct set *)tmp; + ret = mdc_inode_xatt_unset(tmp1->this, tmp1->inode, k); + return ret; +} + +static int +mdc_invalidate(xlator_t *this, void *data) +{ + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + int ret = 0; + struct set tmp = { + 0, + }; + inode_table_t *itable = NULL; + struct mdc_conf *conf = this->private; + uint64_t gen = 0; + + up_data = (struct gf_upcall *)data; + + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; + } + + if (up_ci->flags & UP_PARENT_DENTRY_FLAGS) { + mdc_update_gfid_stat(this, &up_ci->p_stat); + if (up_ci->flags & UP_RENAME_FLAGS) + mdc_update_gfid_stat(this, &up_ci->oldp_stat); + } + + if (up_ci->flags & UP_EXPLICIT_LOOKUP) { + mdc_inode_set_need_lookup(this, inode, _gf_true); + goto out; + } + + if (up_ci->flags & + (UP_NLINK | UP_RENAME_FLAGS | UP_FORGET | UP_INVAL_ATTR)) { + mdc_inode_iatt_invalidate(this, inode); + mdc_inode_xatt_invalidate(this, inode); + GF_ATOMIC_INC(conf->mdc_counter.stat_invals); + goto out; + } + + if (up_ci->flags & IATT_UPDATE_FLAGS) { + gen = mdc_inc_generation(this, inode); + ret = mdc_inode_iatt_set_validate(this, inode, NULL, &up_ci->stat, + _gf_false, gen); + /* one of the scenarios where ret < 0 is when 
this invalidate + * is older than the current stat, in that case do not + * update the xattrs as well + */ + if (ret < 0) + goto out; + GF_ATOMIC_INC(conf->mdc_counter.stat_invals); + } + + if (up_ci->flags & UP_XATTR) { + if (up_ci->dict) + ret = mdc_inode_xatt_update(this, inode, up_ci->dict); + else + ret = mdc_inode_xatt_invalidate(this, inode); + + GF_ATOMIC_INC(conf->mdc_counter.xattr_invals); + } else if (up_ci->flags & UP_XATTR_RM) { + tmp.inode = inode; + tmp.this = this; + ret = dict_foreach(up_ci->dict, mdc_inval_xatt, &tmp); + + GF_ATOMIC_INC(conf->mdc_counter.xattr_invals); + } + +out: + if (inode) + inode_unref(inode); + + return ret; +} + +struct mdc_ipc { + xlator_t *this; + dict_t *xattr; +}; + +static int +mdc_send_xattrs_cbk(int ret, call_frame_t *frame, void *data) +{ + struct mdc_ipc *tmp = data; + + if (ret < 0) { + mdc_key_unload_all(THIS->private); + gf_msg("md-cache", GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for all xattrs, as registering for " + "xattr cache invalidation failed"); + } + STACK_DESTROY(frame->root); + dict_unref(tmp->xattr); + GF_FREE(tmp); + + return 0; +} + +static int +mdc_send_xattrs(void *data) +{ + int ret = 0; + struct mdc_ipc *tmp = data; + + ret = syncop_ipc(FIRST_CHILD(tmp->this), GF_IPC_TARGET_UPCALL, tmp->xattr, + NULL); + DECODE_SYNCOP_ERR(ret); + if (ret < 0) { + gf_msg(tmp->this->name, GF_LOG_WARNING, errno, + MD_CACHE_MSG_IPC_UPCALL_FAILED, + "Registering the list " + "of xattrs that needs invalidaton, with upcall, failed"); + } + + return ret; +} + +static int +mdc_register_xattr_inval(xlator_t *this) +{ + dict_t *xattr = NULL; + int ret = 0; + struct mdc_conf *conf = NULL; + call_frame_t *frame = NULL; + struct mdc_ipc *data = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + if (!conf->mdc_invalidation) { + UNLOCK(&conf->lock); + goto out; + } + } + UNLOCK(&conf->lock); + + xattr = dict_new(); + if (!xattr) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, 
MD_CACHE_MSG_NO_MEMORY, + "dict_new failed"); + ret = -1; + goto out; + } + + if (!mdc_load_reqs(this, xattr)) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to populate cache entries"); + ret = -1; + goto out; + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to create the frame"); + ret = -1; + goto out; + } + + data = GF_CALLOC(1, sizeof(struct mdc_ipc), gf_mdc_mt_mdc_ipc); + if (!data) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "failed to allocate memory"); + ret = -1; + goto out; + } + + data->this = this; + data->xattr = xattr; + ret = synctask_new(this->ctx->env, mdc_send_xattrs, mdc_send_xattrs_cbk, + frame, data); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, + MD_CACHE_MSG_IPC_UPCALL_FAILED, + "Registering the list " + "of xattrs that needs invalidaton, with upcall, failed"); + } + +out: + if (ret < 0) { + mdc_key_unload_all(conf); + if (xattr) + dict_unref(xattr); + if (frame) + STACK_DESTROY(frame->root); + GF_FREE(data); + gf_msg(this->name, GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE, + "Disabled cache for all xattrs, as registering for " + "xattr cache invalidation failed"); + } + + return ret; +} + +int +mdc_reconfigure(xlator_t *this, dict_t *options) +{ + struct mdc_conf *conf = NULL; + int timeout = 0, ret = 0; + char *tmp_str = NULL; + + conf = this->private; + + GF_OPTION_RECONF("md-cache-timeout", timeout, options, int32, out); + + GF_OPTION_RECONF("cache-selinux", conf->cache_selinux, options, bool, out); + + GF_OPTION_RECONF("cache-capability-xattrs", conf->cache_capability, options, + bool, out); + + GF_OPTION_RECONF("cache-ima-xattrs", conf->cache_ima, options, bool, out); + + GF_OPTION_RECONF("cache-posix-acl", conf->cache_posix_acl, options, bool, + out); + + GF_OPTION_RECONF("cache-glusterfs-acl", conf->cache_glusterfs_acl, options, + bool, out); + + 
GF_OPTION_RECONF("cache-swift-metadata", conf->cache_swift_metadata, + options, bool, out); + + GF_OPTION_RECONF("cache-samba-metadata", conf->cache_samba_metadata, + options, bool, out); + + GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, + out); + + GF_OPTION_RECONF("cache-invalidation", conf->mdc_invalidation, options, + bool, out); + + GF_OPTION_RECONF("global-cache-invalidation", conf->global_invalidation, + options, bool, out); + + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + + GF_OPTION_RECONF("md-cache-statfs", conf->cache_statfs, options, bool, out); + + GF_OPTION_RECONF("xattr-cache-list", tmp_str, options, str, out); + + ret = mdc_xattr_list_populate(conf, tmp_str); + if (ret < 0) + goto out; + + /* If timeout is greater than 60s (default before the patch that added + * cache invalidation support was added) then, cache invalidation + * feature for md-cache needs to be enabled, if not set timeout to the + * previous max which is 60s + */ + if ((timeout > 60) && (!conf->mdc_invalidation)) { + conf->timeout = 60; + goto out; + } + conf->timeout = timeout; + + ret = mdc_register_xattr_inval(this); +out: + return ret; +} + +int32_t +mdc_mem_acct_init(xlator_t *this) +{ + return xlator_mem_acct_init(this, gf_mdc_mt_end + 1); +} + +int +mdc_init(xlator_t *this) +{ + struct mdc_conf *conf = NULL; + uint32_t timeout = 0; + char *tmp_str = NULL; + + conf = GF_CALLOC(sizeof(*conf), 1, gf_mdc_mt_mdc_conf_t); + if (!conf) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY, + "out of memory"); + return -1; + } + + LOCK_INIT(&conf->lock); + + GF_OPTION_INIT("md-cache-timeout", timeout, uint32, out); + + GF_OPTION_INIT("cache-selinux", conf->cache_selinux, bool, out); + + GF_OPTION_INIT("cache-capability-xattrs", conf->cache_capability, bool, + out); + + GF_OPTION_INIT("cache-ima-xattrs", conf->cache_ima, bool, out); + + GF_OPTION_INIT("cache-posix-acl", conf->cache_posix_acl, bool, out); + + 
GF_OPTION_INIT("cache-glusterfs-acl", conf->cache_glusterfs_acl, bool, out); + + GF_OPTION_INIT("cache-swift-metadata", conf->cache_swift_metadata, bool, + out); + + GF_OPTION_INIT("cache-samba-metadata", conf->cache_samba_metadata, bool, + out); + + GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); + + GF_OPTION_INIT("cache-invalidation", conf->mdc_invalidation, bool, out); + + GF_OPTION_INIT("global-cache-invalidation", conf->global_invalidation, bool, + out); + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + pthread_mutex_init(&conf->statfs_cache.lock, NULL); + GF_OPTION_INIT("md-cache-statfs", conf->cache_statfs, bool, out); + + GF_OPTION_INIT("xattr-cache-list", tmp_str, str, out); + mdc_xattr_list_populate(conf, tmp_str); + + conf->last_child_down = gf_time(); + conf->statfs_cache.last_refreshed = (time_t)-1; + + /* initialize gf_atomic_t counters */ + GF_ATOMIC_INIT(conf->mdc_counter.stat_hit, 0); + GF_ATOMIC_INIT(conf->mdc_counter.stat_miss, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_hit, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_miss, 0); + GF_ATOMIC_INIT(conf->mdc_counter.negative_lookup, 0); + GF_ATOMIC_INIT(conf->mdc_counter.nameless_lookup, 0); + GF_ATOMIC_INIT(conf->mdc_counter.stat_invals, 0); + GF_ATOMIC_INIT(conf->mdc_counter.xattr_invals, 0); + GF_ATOMIC_INIT(conf->mdc_counter.need_lookup, 0); + GF_ATOMIC_INIT(conf->generation, 0); + + /* If timeout is greater than 60s (default before the patch that added + * cache invalidation support was added) then, cache invalidation + * feature for md-cache needs to be enabled, if not set timeout to the + * previous max which is 60s + */ + if ((timeout > 60) && (!conf->mdc_invalidation)) { + conf->timeout = 60; + goto out; + } + conf->timeout = timeout; + +out: + this->private = conf; + + return 0; +} + +void +mdc_update_child_down_time(xlator_t *this, time_t now) +{ + struct mdc_conf *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + 
conf->last_child_down = now; + } + UNLOCK(&conf->lock); +} + +int +mdc_notify(xlator_t *this, int event, void *data, ...) +{ + int ret = 0; + struct mdc_conf *conf = NULL; + + conf = this->private; + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + mdc_update_child_down_time(this, gf_time()); + break; + case GF_EVENT_UPCALL: + if (conf->mdc_invalidation) + ret = mdc_invalidate(this, data); + break; + case GF_EVENT_CHILD_UP: + case GF_EVENT_SOME_DESCENDENT_UP: + ret = mdc_register_xattr_inval(this); + break; + default: + break; + } + + if (default_notify(this, event, data) != 0) + ret = -1; + + return ret; +} + +void +mdc_fini(xlator_t *this) +{ + GF_FREE(this->private); +} + +struct xlator_fops mdc_fops = { + .lookup = mdc_lookup, + .stat = mdc_stat, + .fstat = mdc_fstat, + .truncate = mdc_truncate, + .ftruncate = mdc_ftruncate, + .mknod = mdc_mknod, + .mkdir = mdc_mkdir, + .unlink = mdc_unlink, + .rmdir = mdc_rmdir, + .symlink = mdc_symlink, + .rename = mdc_rename, + .link = mdc_link, + .create = mdc_create, + .open = mdc_open, + .readv = mdc_readv, + .writev = mdc_writev, + .setattr = mdc_setattr, + .fsetattr = mdc_fsetattr, + .fsync = mdc_fsync, + .setxattr = mdc_setxattr, + .fsetxattr = mdc_fsetxattr, + .getxattr = mdc_getxattr, + .fgetxattr = mdc_fgetxattr, + .removexattr = mdc_removexattr, + .fremovexattr = mdc_fremovexattr, + .opendir = mdc_opendir, + .readdirp = mdc_readdirp, + .readdir = mdc_readdir, + .fallocate = mdc_fallocate, + .discard = mdc_discard, + .zerofill = mdc_zerofill, + .statfs = mdc_statfs, + .readlink = mdc_readlink, + .fsyncdir = mdc_fsyncdir, + .access = mdc_access, +}; + +struct xlator_cbks mdc_cbks = { + .forget = mdc_forget, +}; + +struct xlator_dumpops mdc_dumpops = { + .priv = mdc_priv_dump, +}; + +struct volume_options mdc_options[] = { + { + .key = {"md-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable md-cache", + .op_version = 
{GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"cache-selinux"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache selinux xattr(security.selinux) on client side", + }, + { + .key = {"cache-capability-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache capability xattr(security.capability) on " + "client side", + }, + { + .key = {"cache-ima-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache Linux integrity subsystem xattr(security.ima) " + "on client side", + }, + { + .key = {"cache-swift-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_7_10}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache swift metadata (user.swift.metadata xattr)", + }, + { + .key = {"cache-samba-metadata"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache samba metadata (user.DOSATTRIB, security.NTACL," + " org.netatalk.Metadata, org.netatalk.ResourceFork, " + "and user.DosStream. 
xattrs)", + }, + { + .key = {"cache-posix-acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache posix ACL xattrs (system.posix_acl_access, " + "system.posix_acl_default) on client side", + }, + { + .key = {"cache-glusterfs-acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache virtual glusterfs ACL xattrs " + "(glusterfs.posix.acl, glusterfs.posix.default_acl) " + "on client side", + }, + { + .key = {"md-cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 600, + .default_value = SITE_H_MD_CACHE_TIMEOUT, + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Time period after which cache has to be refreshed", + }, + { + .key = {"force-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Convert all readdir requests to readdirplus to " + "collect stat info on each entry.", + }, + { + .key = {"cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_3_9_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "When \"on\", invalidates/updates the metadata cache," + " on receiving the cache-invalidation notifications", + }, + { + .key = {"global-cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = + "When \"on\", purges all read caches in kernel and glusterfs stack " + "whenever a stat change is detected. 
Stat changes can be detected " + "while processing responses to file operations (fop) or through " + "upcall notifications. Since purging caches can be an expensive " + "operation, it's advised to have this option \"on\" only when a " + "file " + "can be accessed from multiple different Glusterfs mounts and " + "caches across these different mounts are required to be coherent. " + "If a file is not accessed across different mounts " + "(simple example is having only one mount for a volume), its " + "advised to keep " + "this option \"off\" as all file modifications go through caches " + "keeping them " + "coherent. This option overrides value of " + "performance.cache-invalidation.", + }, + { + .key = {"md-cache-statfs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache statfs information of filesystem on the client", + }, + { + .key = {"xattr-cache-list"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "A comma separated list of xattrs that shall be " + "cached by md-cache. 
The only wildcard allowed is '*'", + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"md-cache"}, + .description = "Enable/Disable md cache translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = mdc_init, + .fini = mdc_fini, + .notify = mdc_notify, + .reconfigure = mdc_reconfigure, + .mem_acct_init = mdc_mem_acct_init, + .dump_metrics = mdc_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &mdc_dumpops, + .fops = &mdc_fops, + .cbks = &mdc_cbks, + .options = mdc_options, + .identifier = "md-cache", + .category = GF_MAINTAINED, +}; diff --git a/xlators/performance/symlink-cache/Makefile.am b/xlators/performance/nl-cache/Makefile.am index d471a3f9243..a985f42a877 100644 --- a/xlators/performance/symlink-cache/Makefile.am +++ b/xlators/performance/nl-cache/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = src -CLEANFILES = +CLEANFILES = diff --git a/xlators/performance/nl-cache/src/Makefile.am b/xlators/performance/nl-cache/src/Makefile.am new file mode 100644 index 00000000000..c44ce871627 --- /dev/null +++ b/xlators/performance/nl-cache/src/Makefile.am @@ -0,0 +1,12 @@ +xlator_LTLIBRARIES = nl-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance +nl_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +nl_cache_la_SOURCES = nl-cache.c nl-cache-helper.c +nl_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +noinst_HEADERS = nl-cache.h nl-cache-mem-types.h nl-cache-messages.h +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(CONTRIBDIR)/timer-wheel + +AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) +CLEANFILES = diff --git a/xlators/performance/nl-cache/src/nl-cache-helper.c b/xlators/performance/nl-cache/src/nl-cache-helper.c new 
file mode 100644 index 00000000000..29b99b5b8ea --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-helper.c @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#include "nl-cache.h" +#include "timer-wheel.h" +#include <glusterfs/statedump.h> + +/* Caching guidelines: + * This xlator serves negative lookup(ENOENT lookups) from the cache, + * there by making create faster. + * What is cached? + * Negative lookup cache is stored for each directory, and has 2 entries: + * - Negative entries: Populated only when lookup/stat returns ENOENT. + * Fuse mostly sends only one lookup before create, hence negative entry + * cache is almost useless. But for SMB access, multiple lookups/stats + * are sent before creating the file. Hence the negative entry cache. + * It can exist even when the positive entry cache is invalid. It also + * has the entries that were deleted from this directory. + * Freed on receiving upcall(with dentry change flag) or on expiring + * timeout of the cache. + * + * - Positive entries: Populated as a part of readdirp, and as a part of + * mkdir followed by creates inside that directory. Lookups and other + * fops do not populate the positive entry (as it can grow long and is + * of no value add) + * Freed on receiving upcall(with dentry change flag) or on expiring + * timeout of the cache. + * + * Data structures to store cache? + * The cache of any directory is stored in the inode_ctx of the directory. + * Negative entries are stored as list of strings. 
+ * Search - O(n) + * Add - O(1) + * Delete - O(n) - as it has to be searched before deleting + * Positive entries are stored as a list, each list node has a pointer + * to the inode of the positive entry or the name of the entry. + * Since the client side inode table already will have inodes for + * positive entries, we just take a ref of that inode and store as + * positive entry cache. In cases like hardlinks and readdirp where + * inode is NULL, we store the names. + * Name Search - O(n) + * Inode Search - O(1) - Actually complexity of inode_find() + * Name/inode Add - O(1) + * Name Delete - O(n) + * Inode Delete - O(1) + * + * Locking order: + * + * TODO: + * - Fill Positive entries on readdir/p, after which in lookup_cbk check if the + * name is in PE and replace it with inode. + * - fini, PARENET_DOWN, disable caching + * - Virtual setxattr to dump the inode_ctx, to ease debugging + * - Handle dht_nuke xattr: clear all cache + * - Special handling for .meta and .trashcan? + */ + +int +__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx); +int +__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx); +void +nlc_remove_from_lru(xlator_t *this, inode_t *inode); +void +__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx); +gf_boolean_t +__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name); +void +__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe); +void +__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne); + +static int32_t +nlc_get_cache_timeout(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + /* Cache timeout is generally not meant to be changed often, + * once set, hence not within locks */ + return conf->cache_timeout; +} + +static gf_boolean_t +__nlc_is_cache_valid(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + time_t last_val_time; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO(this->name, nlc_ctx, out); + + conf = 
this->private; + + LOCK(&conf->lock); + { + last_val_time = conf->last_child_down; + } + UNLOCK(&conf->lock); + + if ((last_val_time <= nlc_ctx->cache_time) && (nlc_ctx->cache_time != 0)) + ret = _gf_true; +out: + return ret; +} + +void +nlc_update_child_down_time(xlator_t *this, time_t now) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + conf->last_child_down = now; + } + UNLOCK(&conf->lock); + + return; +} + +void +nlc_disable_cache(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + conf->disable_cache = _gf_true; + } + UNLOCK(&conf->lock); + + return; +} + +static int +__nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + int ret = 0; + nlc_ctx_t *nlc_ctx = NULL; + uint64_t nlc_ctx_int = 0; + uint64_t nlc_pe_int = 0; + + ret = __inode_ctx_get2(inode, this, &nlc_ctx_int, &nlc_pe_int); + if (ret == 0 && nlc_ctx_p) { + nlc_ctx = (void *)(long)(nlc_ctx_int); + *nlc_ctx_p = nlc_ctx; + } + return ret; +} + +static int +nlc_inode_ctx_set(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx, + nlc_pe_t *nlc_pe_p) +{ + uint64_t ctx1, ctx2; + int ret = -1; + + ctx1 = (uint64_t)(uintptr_t)nlc_ctx; + ctx2 = (uint64_t)(uintptr_t)nlc_pe_p; + + /* The caller may choose to set one of the ctxs, hence check + * if the ctx1/2 is non zero and then send the address. If we + * blindly send the address of both the ctxs, it may reset the + * ctx the caller had sent NULL(intended as leave untouched) for.*/ + LOCK(&inode->lock); + { + ret = __inode_ctx_set2(inode, this, ctx1 ? &ctx1 : 0, ctx2 ? 
&ctx2 : 0); + } + UNLOCK(&inode->lock); + return ret; +} + +static void +nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __nlc_inode_ctx_get(this, inode, nlc_ctx_p); + if (ret < 0) + gf_msg_debug(this->name, 0, + "inode ctx get failed for " + "inode:%p", + inode); + } + UNLOCK(&inode->lock); + + return; +} + +static void +__nlc_inode_clear_entries(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp1 = NULL; + + if (!nlc_ctx) + goto out; + + if (IS_PE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + __nlc_free_pe(this, nlc_ctx, pe); + } + + if (IS_NE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(ne, tmp1, &nlc_ctx->ne, list) + { + __nlc_free_ne(this, nlc_ctx, ne); + } + + nlc_ctx->cache_time = 0; + nlc_ctx->state = 0; + GF_ASSERT(nlc_ctx->cache_size == sizeof(*nlc_ctx)); + GF_ASSERT(nlc_ctx->refd_inodes == 0); +out: + return; +} + +static void +nlc_init_invalid_ctx(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + /* The cache/nlc_ctx can be invalid for 2 reasons: + * - Because of a child-down/timer expiry, cache is + * invalid but the nlc_ctx is not yet cleaned up. + * - nlc_ctx is cleaned up, because of invalidations + * or lru prune etc.*/ + + /* If the cache is present but invalid, clear the cache and + * reset the timer. 
*/ + __nlc_inode_clear_entries(this, nlc_ctx); + + /* If timer is present, then it is already part of lru as well + * Hence reset the timer and return.*/ + if (nlc_ctx->timer) { + gf_tw_mod_timer_pending(conf->timer_wheel, nlc_ctx->timer, + conf->cache_timeout); + nlc_ctx->cache_time = gf_time(); + goto unlock; + } + + /* If timer was NULL, the nlc_ctx is already cleanedup, + * and we need to start timer and add to lru, so that it is + * ready to cache entries a fresh */ + ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx); + if (ret < 0) + goto unlock; + + ret = __nlc_add_to_lru(this, inode, nlc_ctx); + if (ret < 0) { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +static nlc_ctx_t * +nlc_inode_ctx_get_set(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p) +{ + uint64_t ctx; + int ret = 0; + nlc_ctx_t *nlc_ctx = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&inode->lock); + { + ret = __nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (nlc_ctx) + goto unlock; + + nlc_ctx = GF_CALLOC(sizeof(*nlc_ctx), 1, gf_nlc_mt_nlc_ctx_t); + if (!nlc_ctx) + goto unlock; + + LOCK_INIT(&nlc_ctx->lock); + INIT_LIST_HEAD(&nlc_ctx->pe); + INIT_LIST_HEAD(&nlc_ctx->ne); + + ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx); + if (ret < 0) + goto unlock; + + ret = __nlc_add_to_lru(this, inode, nlc_ctx); + if (ret < 0) { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + goto unlock; + } + + ctx = (uint64_t)(uintptr_t)nlc_ctx; + ret = __inode_ctx_set2(inode, this, &ctx, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NLC_MSG_NO_MEMORY, + "inode ctx set failed"); + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + nlc_remove_from_lru(this, inode); + goto unlock; + } + + /*TODO: also sizeof (gf_tw_timer_list) + nlc_timer_data_t ?*/ + nlc_ctx->cache_size = sizeof(*nlc_ctx); + GF_ATOMIC_ADD(conf->current_cache_size, nlc_ctx->cache_size); + } +unlock: + 
UNLOCK(&inode->lock); + + if (ret == 0 && nlc_ctx_p) { + *nlc_ctx_p = nlc_ctx; + nlc_init_invalid_ctx(this, inode, nlc_ctx); + } + + if (ret < 0 && nlc_ctx) { + LOCK_DESTROY(&nlc_ctx->lock); + GF_FREE(nlc_ctx); + nlc_ctx = NULL; + goto out; + } + +out: + return nlc_ctx; +} + +nlc_local_t * +nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, loc_t *loc2) +{ + nlc_local_t *local = NULL; + + local = GF_CALLOC(sizeof(*local), 1, gf_nlc_mt_nlc_local_t); + if (!local) + goto out; + + if (loc) + loc_copy(&local->loc, loc); + if (loc2) + loc_copy(&local->loc2, loc2); + + local->fop = fop; + frame->local = local; +out: + return local; +} + +void +nlc_local_wipe(xlator_t *this, nlc_local_t *local) +{ + if (!local) + goto out; + + loc_wipe(&local->loc); + + loc_wipe(&local->loc2); + + GF_FREE(local); +out: + return; +} + +static void +__nlc_set_dir_state(nlc_ctx_t *nlc_ctx, uint64_t new_state) +{ + nlc_ctx->state |= new_state; + + return; +} + +void +nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_set_dir_state(nlc_ctx, state); + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +static void +nlc_cache_timeout_handler(struct gf_tw_timer_list *timer, void *data, + unsigned long calltime) +{ + nlc_timer_data_t *tmp = data; + nlc_ctx_t *nlc_ctx = NULL; + + nlc_inode_ctx_get(tmp->this, tmp->inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + /* Taking nlc_ctx->lock will lead to deadlock, hence updating + * the cache is invalid outside of lock, instead of clear_cache. 
+ * Since cache_time is assigned outside of lock, the value can + * be invalid for short time, this may result in false negative + * which is better than deadlock */ + nlc_ctx->cache_time = 0; +out: + return; +} + +void +__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + if (nlc_ctx->timer) + gf_tw_del_timer(conf->timer_wheel, nlc_ctx->timer); + + if (nlc_ctx->timer_data) { + inode_unref(nlc_ctx->timer_data->inode); + GF_FREE(nlc_ctx->timer_data); + nlc_ctx->timer_data = NULL; + } + + GF_FREE(nlc_ctx->timer); + nlc_ctx->timer = NULL; + + return; +} + +int +__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + struct gf_tw_timer_list *timer = NULL; + nlc_timer_data_t *tmp = NULL; + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + /* We are taking inode_table->lock within inode->lock + * as the only other caller which takes inode->lock within + * inode_table->lock and cause deadlock is inode_table_destroy. + * Hopefully, there can be no fop when inode_table_destroy is + * being called. 
*/ + tmp = GF_CALLOC(1, sizeof(*tmp), gf_nlc_mt_nlc_timer_data_t); + if (!tmp) + goto out; + tmp->inode = inode_ref(inode); + tmp->this = this; + + timer = GF_CALLOC(1, sizeof(*timer), gf_common_mt_tw_timer_list); + if (!timer) + goto out; + + INIT_LIST_HEAD(&timer->entry); + timer->expires = nlc_get_cache_timeout(this); + timer->function = nlc_cache_timeout_handler; + timer->data = tmp; + nlc_ctx->timer = timer; + nlc_ctx->timer_data = tmp; + gf_tw_add_timer(conf->timer_wheel, timer); + + nlc_ctx->cache_time = gf_time(); + gf_msg_trace(this->name, 0, + "Registering timer:%p, inode:%p, " + "gfid:%s", + timer, inode, uuid_utoa(inode->gfid)); + + ret = 0; + +out: + if (ret < 0) { + if (tmp && tmp->inode) + inode_unref(tmp->inode); + GF_FREE(tmp); + GF_FREE(timer); + } + + return ret; +} + +int +__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx) +{ + nlc_lru_node_t *lru_ino = NULL; + uint64_t nlc_pe_int = 0; + nlc_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + lru_ino = GF_CALLOC(1, sizeof(*lru_ino), gf_nlc_mt_nlc_lru_node); + if (!lru_ino) + goto out; + + INIT_LIST_HEAD(&lru_ino->list); + lru_ino->inode = inode_ref(inode); + LOCK(&conf->lock); + { + list_add_tail(&lru_ino->list, &conf->lru); + } + UNLOCK(&conf->lock); + + nlc_ctx->refd_inodes = 0; + ret = __inode_ctx_get2(inode, this, NULL, &nlc_pe_int); + if (nlc_pe_int == 0) + GF_ATOMIC_ADD(conf->refd_inodes, 1); + + ret = 0; + +out: + return ret; +} + +void +nlc_remove_from_lru(xlator_t *this, inode_t *inode) +{ + nlc_lru_node_t *lru_node = NULL; + nlc_lru_node_t *tmp = NULL; + nlc_lru_node_t *tmp1 = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + list_for_each_entry_safe(lru_node, tmp, &conf->lru, list) + { + if (inode == lru_node->inode) { + list_del(&lru_node->list); + tmp1 = lru_node; + break; + } + } + } + UNLOCK(&conf->lock); + + if (tmp1) { + inode_unref(tmp1->inode); + GF_FREE(tmp1); + } + + return; +} + +void 
+nlc_lru_prune(xlator_t *this, inode_t *inode) +{ + nlc_lru_node_t *lru_node = NULL; + nlc_lru_node_t *prune_node = NULL; + nlc_lru_node_t *tmp = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + LOCK(&conf->lock); + { + if ((GF_ATOMIC_GET(conf->refd_inodes) < conf->inode_limit) && + (GF_ATOMIC_GET(conf->current_cache_size) < conf->cache_size)) + goto unlock; + + list_for_each_entry_safe(lru_node, tmp, &conf->lru, list) + { + list_del(&lru_node->list); + prune_node = lru_node; + goto unlock; + } + } +unlock: + UNLOCK(&conf->lock); + + if (prune_node) { + nlc_inode_clear_cache(this, prune_node->inode, NLC_LRU_PRUNE); + inode_unref(prune_node->inode); + GF_FREE(prune_node); + } + return; +} + +void +nlc_clear_all_cache(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + struct list_head clear_list; + nlc_lru_node_t *prune_node = NULL; + nlc_lru_node_t *tmp = NULL; + + conf = this->private; + + INIT_LIST_HEAD(&clear_list); + + LOCK(&conf->lock); + { + list_replace_init(&conf->lru, &clear_list); + } + UNLOCK(&conf->lock); + + list_for_each_entry_safe(prune_node, tmp, &clear_list, list) + { + list_del(&prune_node->list); + nlc_inode_clear_cache(this, prune_node->inode, NLC_LRU_PRUNE); + inode_unref(prune_node->inode); + GF_FREE(prune_node); + } + + return; +} + +void +__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe) +{ + uint64_t pe_int = 0; + nlc_conf_t *conf = NULL; + uint64_t nlc_ctx_int = 0; + + conf = this->private; + + if (pe->inode) { + inode_ctx_reset1(pe->inode, this, &pe_int); + inode_ctx_get2(pe->inode, this, &nlc_ctx_int, NULL); + inode_unref(pe->inode); + } + list_del(&pe->list); + + nlc_ctx->cache_size -= sizeof(*pe) + sizeof(pe->name); + GF_ATOMIC_SUB(conf->current_cache_size, (sizeof(*pe) + sizeof(pe->name))); + + nlc_ctx->refd_inodes -= 1; + if (nlc_ctx_int == 0) + GF_ATOMIC_SUB(conf->refd_inodes, 1); + + GF_FREE(pe->name); + GF_FREE(pe); + + return; +} + +void +__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne) +{ + 
nlc_conf_t *conf = NULL; + + conf = this->private; + + list_del(&ne->list); + GF_FREE(ne->name); + GF_FREE(ne); + + nlc_ctx->cache_size -= sizeof(*ne) + sizeof(ne->name); + GF_ATOMIC_SUB(conf->current_cache_size, (sizeof(*ne) + sizeof(ne->name))); + + return; +} + +void +nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason) +{ + nlc_ctx_t *nlc_ctx = NULL; + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_inode_ctx_timer_delete(this, nlc_ctx); + + __nlc_inode_clear_entries(this, nlc_ctx); + } + UNLOCK(&nlc_ctx->lock); + + if (reason != NLC_LRU_PRUNE) + nlc_remove_from_lru(this, inode); + +out: + return; +} + +static void +__nlc_del_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino, + const char *name, gf_boolean_t multilink) +{ + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + gf_boolean_t found = _gf_false; + uint64_t pe_int = 0; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + if (!entry_ino) + goto name_search; + + /* If there are hardlinks first search names, followed by inodes */ + if (multilink) { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + goto out; + } + } + inode_ctx_reset1(entry_ino, this, &pe_int); + if (pe_int) { + pe = (void *)(long)(pe_int); + found = _gf_true; + goto out; + } + goto out; + } + + inode_ctx_reset1(entry_ino, this, &pe_int); + if (pe_int) { + pe = (void *)(long)(pe_int); + found = _gf_true; + goto out; + } + +name_search: + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + break; + /* TODO: can there be duplicates? 
*/ + } + } + +out: + if (found) + __nlc_free_pe(this, nlc_ctx, pe); + + return; +} + +static void +__nlc_del_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name) +{ + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp = NULL; + + if (!IS_NE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(ne, tmp, &nlc_ctx->ne, list) + { + if (strcmp(ne->name, name) == 0) { + __nlc_free_ne(this, nlc_ctx, ne); + break; + } + } +out: + return; +} + +static void +__nlc_add_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino, + const char *name) +{ + nlc_pe_t *pe = NULL; + int ret = -1; + nlc_conf_t *conf = NULL; + uint64_t nlc_ctx_int = 0; + + conf = this->private; + + /* TODO: There can be no duplicate entries, as it is added only + during create. In case there arises duplicate entries, search PE + found = __nlc_search (entries, name, _gf_false); + can use bit vector to have simple search than sequential search */ + + pe = GF_CALLOC(sizeof(*pe), 1, gf_nlc_mt_nlc_pe_t); + if (!pe) + goto out; + + if (entry_ino) { + pe->inode = inode_ref(entry_ino); + nlc_inode_ctx_set(this, entry_ino, NULL, pe); + } else if (name) { + pe->name = gf_strdup(name); + if (!pe->name) + goto out; + } + + list_add(&pe->list, &nlc_ctx->pe); + + nlc_ctx->cache_size += sizeof(*pe) + sizeof(pe->name); + GF_ATOMIC_ADD(conf->current_cache_size, (sizeof(*pe) + sizeof(pe->name))); + + nlc_ctx->refd_inodes += 1; + inode_ctx_get2(entry_ino, this, &nlc_ctx_int, NULL); + if (nlc_ctx_int == 0) + GF_ATOMIC_ADD(conf->refd_inodes, 1); + + ret = 0; +out: + if (ret) + GF_FREE(pe); + + return; +} + +static void +__nlc_add_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name) +{ + nlc_ne_t *ne = NULL; + int ret = -1; + nlc_conf_t *conf = NULL; + + conf = this->private; + + /* TODO: search ne before adding to get rid of duplicate entries + found = __nlc_search (entries, name, _gf_false); + can use bit vector to have faster search than sequential search */ + + ne = GF_CALLOC(sizeof(*ne), 1, gf_nlc_mt_nlc_ne_t); + if 
(!ne) + goto out; + + ne->name = gf_strdup(name); + if (!ne->name) + goto out; + + list_add(&ne->list, &nlc_ctx->ne); + + nlc_ctx->cache_size += sizeof(*ne) + sizeof(ne->name); + GF_ATOMIC_ADD(conf->current_cache_size, (sizeof(*ne) + sizeof(ne->name))); + ret = 0; +out: + if (ret) + GF_FREE(ne); + + return; +} + +void +nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + /* There is one possibility where we need to search before + * adding NE: when there are two parallel lookups on a non + * existent file */ + if (!__nlc_search_ne(nlc_ctx, name)) { + __nlc_add_ne(this, nlc_ctx, name); + __nlc_set_dir_state(nlc_ctx, NLC_NE_VALID); + } + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +void +nlc_dir_remove_pe(xlator_t *this, inode_t *parent, inode_t *entry_ino, + const char *name, gf_boolean_t multilink) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (parent->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get(this, parent, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + __nlc_del_pe(this, nlc_ctx, entry_ino, name, multilink); + __nlc_add_ne(this, nlc_ctx, name); + __nlc_set_dir_state(nlc_ctx, NLC_NE_VALID); + } +unlock: + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +void +nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name) +{ + nlc_ctx_t *nlc_ctx = NULL; + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get_set(this, inode, 
&nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + __nlc_del_ne(this, nlc_ctx, name); + __nlc_add_pe(this, nlc_ctx, entry_ino, name); + if (!IS_PE_VALID(nlc_ctx->state)) + __nlc_set_dir_state(nlc_ctx, NLC_PE_PARTIAL); + } + UNLOCK(&nlc_ctx->lock); +out: + return; +} + +gf_boolean_t +__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name) +{ + gf_boolean_t found = _gf_false; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp = NULL; + + if (!IS_NE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(ne, tmp, &nlc_ctx->ne, list) + { + if (strcmp(ne->name, name) == 0) { + found = _gf_true; + break; + } + } +out: + return found; +} + +static gf_boolean_t +__nlc_search_pe(nlc_ctx_t *nlc_ctx, const char *name) +{ + gf_boolean_t found = _gf_false; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = _gf_true; + break; + } + } +out: + return found; +} + +static char * +__nlc_get_pe(nlc_ctx_t *nlc_ctx, const char *name, + gf_boolean_t case_insensitive) +{ + char *found = NULL; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + + if (!IS_PE_VALID(nlc_ctx->state)) + goto out; + + if (case_insensitive) { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcasecmp(pe->name, name) == 0)) { + found = pe->name; + break; + } + } + } else { + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + if (pe->name && (strcmp(pe->name, name) == 0)) { + found = pe->name; + break; + } + } + } +out: + return found; +} + +gf_boolean_t +nlc_is_negative_lookup(xlator_t *this, loc_t *loc) +{ + nlc_ctx_t *nlc_ctx = NULL; + inode_t *inode = NULL; + gf_boolean_t neg_entry = _gf_false; + + inode = loc->parent; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type 
dir"); + goto out; + } + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + if (__nlc_search_ne(nlc_ctx, loc->name)) { + neg_entry = _gf_true; + goto unlock; + } + if ((nlc_ctx->state & NLC_PE_FULL) && + !__nlc_search_pe(nlc_ctx, loc->name)) { + neg_entry = _gf_true; + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); + +out: + return neg_entry; +} + +gf_boolean_t +nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname, + int32_t *op_ret, int32_t *op_errno, dict_t *dict) +{ + nlc_ctx_t *nlc_ctx = NULL; + inode_t *inode = NULL; + gf_boolean_t hit = _gf_false; + char *found_file = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO(this->name, loc, out); + GF_VALIDATE_OR_GOTO(this->name, fname, out); + GF_VALIDATE_OR_GOTO(this->name, op_ret, out); + GF_VALIDATE_OR_GOTO(this->name, op_errno, out); + GF_VALIDATE_OR_GOTO(this->name, dict, out); + + inode = loc->inode; + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + if (inode->ia_type != IA_IFDIR) { + gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL, + "inode is not of type dir"); + goto out; + } + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + if (!nlc_ctx) + goto out; + + LOCK(&nlc_ctx->lock); + { + if (!__nlc_is_cache_valid(this, nlc_ctx)) + goto unlock; + + found_file = __nlc_get_pe(nlc_ctx, fname, _gf_true); + if (found_file) { + ret = dict_set_dynstr(dict, GF_XATTR_GET_REAL_FILENAME_KEY, + gf_strdup(found_file)); + if (ret < 0) + goto unlock; + *op_ret = strlen(found_file) + 1; + hit = _gf_true; + goto unlock; + } + if (!found_file && (nlc_ctx->state & NLC_PE_FULL)) { + *op_ret = -1; + *op_errno = ENOENT; + hit = _gf_true; + goto unlock; + } + } +unlock: + UNLOCK(&nlc_ctx->lock); + +out: + return hit; +} + +void +nlc_dump_inodectx(xlator_t *this, inode_t *inode) +{ + int32_t ret = -1; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char 
uuid_str[64] = { + 0, + }; + nlc_ctx_t *nlc_ctx = NULL; + nlc_pe_t *pe = NULL; + nlc_pe_t *tmp = NULL; + nlc_ne_t *ne = NULL; + nlc_ne_t *tmp1 = NULL; + + nlc_inode_ctx_get(this, inode, &nlc_ctx); + + if (!nlc_ctx) + goto out; + + ret = TRY_LOCK(&nlc_ctx->lock); + if (!ret) { + gf_proc_dump_build_key(key_prefix, "xlator.performance.nl-cache", + "nlc_inode"); + gf_proc_dump_add_section("%s", key_prefix); + + __inode_path(inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } + + uuid_utoa_r(inode->gfid, uuid_str); + + gf_proc_dump_write("inode", "%p", inode); + gf_proc_dump_write("gfid", "%s", uuid_str); + + gf_proc_dump_write("state", "%" PRIu64, nlc_ctx->state); + gf_proc_dump_write("timer", "%p", nlc_ctx->timer); + gf_proc_dump_write("cache-time", "%ld", nlc_ctx->cache_time); + gf_proc_dump_write("cache-size", "%zu", nlc_ctx->cache_size); + gf_proc_dump_write("refd-inodes", "%" PRIu64, nlc_ctx->refd_inodes); + + if (IS_PE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list) + { + gf_proc_dump_write("pe", "%p, %p, %s", pe, pe->inode, pe->name); + } + + if (IS_NE_VALID(nlc_ctx->state)) + list_for_each_entry_safe(ne, tmp1, &nlc_ctx->ne, list) + { + gf_proc_dump_write("ne", "%s", ne->name); + } + + UNLOCK(&nlc_ctx->lock); + } + + if (ret && nlc_ctx) + gf_proc_dump_write("Unable to dump the inode information", + "(Lock acquisition failed) %p (gfid: %s)", nlc_ctx, + uuid_str); +out: + return; +} diff --git a/xlators/performance/nl-cache/src/nl-cache-mem-types.h b/xlators/performance/nl-cache/src/nl-cache-mem-types.h new file mode 100644 index 00000000000..93a17b3fd5a --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-mem-types.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. 
+ * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_MEM_TYPES_H__ +#define __NL_CACHE_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_nlc_mem_types_ { + gf_nlc_mt_nlc_conf_t = gf_common_mt_end + 1, + gf_nlc_mt_nlc_ctx_t, + gf_nlc_mt_nlc_local_t, + gf_nlc_mt_nlc_pe_t, + gf_nlc_mt_nlc_ne_t, + gf_nlc_mt_nlc_timer_data_t, + gf_nlc_mt_nlc_lru_node, + gf_nlc_mt_end +}; + +#endif /* __NL_CACHE_MEM_TYPES_H__ */ diff --git a/xlators/performance/nl-cache/src/nl-cache-messages.h b/xlators/performance/nl-cache/src/nl-cache-messages.h new file mode 100644 index 00000000000..222d709e133 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache-messages.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_MESSAGES_H__ +#define __NL_CACHE_MESSAGES_H__ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(NLC, NLC_MSG_NO_MEMORY, NLC_MSG_EINVAL, NLC_MSG_NO_TIMER_WHEEL, + NLC_MSG_DICT_FAILURE); + +#endif /* __NL_CACHE_MESSAGES_H__ */ diff --git a/xlators/performance/nl-cache/src/nl-cache.c b/xlators/performance/nl-cache/src/nl-cache.c new file mode 100644 index 00000000000..33a7c471663 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache.c @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. + * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#include "nl-cache.h" +#include <glusterfs/statedump.h> +#include <glusterfs/upcall-utils.h> + +static void +nlc_dentry_op(call_frame_t *frame, xlator_t *this, gf_boolean_t multilink) +{ + nlc_local_t *local = frame->local; + + GF_VALIDATE_OR_GOTO(this->name, local, out); + + switch (local->fop) { + case GF_FOP_MKDIR: + nlc_set_dir_state(this, local->loc.inode, NLC_PE_FULL); + /*fall-through*/ + case GF_FOP_MKNOD: + case GF_FOP_CREATE: + case GF_FOP_SYMLINK: + nlc_dir_add_pe(this, local->loc.parent, local->loc.inode, + local->loc.name); + break; + case GF_FOP_LINK: + nlc_dir_add_pe(this, local->loc2.parent, NULL, local->loc2.name); + break; + case GF_FOP_RMDIR: + nlc_inode_clear_cache(this, local->loc.inode, _gf_false); + /*fall-through*/ + case GF_FOP_UNLINK: + nlc_dir_remove_pe(this, local->loc.parent, local->loc.inode, + local->loc.name, multilink); + break; + case GF_FOP_RENAME: + /* TBD: Should these be atomic ? 
In case of rename, the + * newloc->inode can be NULL, and hence use oldloc->inode */ + nlc_dir_remove_pe(this, local->loc2.parent, local->loc2.inode, + local->loc2.name, _gf_false); + + /*TODO: Remove old dentry from destination before adding this pe*/ + nlc_dir_add_pe(this, local->loc.parent, local->loc2.inode, + local->loc.name); + + default: + return; + } + + nlc_lru_prune(this, NULL); +out: + return; +} + +#define NLC_FOP(_name, _op, loc1, loc2, frame, this, args...) \ + do { \ + nlc_local_t *__local = NULL; \ + nlc_conf_t *conf = NULL; \ + \ + conf = this->private; \ + \ + if (!IS_PEC_ENABLED(conf)) \ + goto disabled; \ + \ + __local = nlc_local_init(frame, this, _op, loc1, loc2); \ + GF_VALIDATE_OR_GOTO(this->name, __local, err); \ + \ + STACK_WIND(frame, nlc_##_name##_cbk, FIRST_CHILD(this), \ + FIRST_CHILD(this)->fops->_name, args); \ + break; \ + disabled: \ + default_##_name##_resume(frame, this, args); \ + break; \ + err: \ + default_##_name##_failure_cbk(frame, ENOMEM); \ + break; \ + } while (0) + +#define NLC_FOP_CBK(_name, multilink, frame, cookie, this, op_ret, op_errno, \ + args...) 
\ + do { \ + nlc_conf_t *conf = NULL; \ + \ + if (op_ret != 0) \ + goto out; \ + \ + conf = this->private; \ + \ + if (op_ret < 0 || !IS_PEC_ENABLED(conf)) \ + goto out; \ + nlc_dentry_op(frame, this, multilink); \ + out: \ + NLC_STACK_UNWIND(_name, frame, op_ret, op_errno, args); \ + } while (0) + +static int32_t +nlc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + NLC_FOP_CBK(rename, _gf_false, frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, postnewparent, + xdata); + return 0; +} + +static int32_t +nlc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + NLC_FOP(rename, GF_FOP_RENAME, newloc, oldloc, frame, this, oldloc, newloc, + xdata); + return 0; +} + +static int32_t +nlc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(mknod, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + NLC_FOP(mknod, GF_FOP_MKNOD, loc, NULL, frame, this, loc, mode, rdev, umask, + xdata); + return 0; +} + +static int32_t +nlc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(create, _gf_false, frame, cookie, this, op_ret, op_errno, fd, + inode, buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, 
int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + NLC_FOP(create, GF_FOP_CREATE, loc, NULL, frame, this, loc, flags, mode, + umask, fd, xdata); + return 0; +} + +static int32_t +nlc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(mkdir, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + NLC_FOP(mkdir, GF_FOP_MKDIR, loc, NULL, frame, this, loc, mode, umask, + xdata); + return 0; +} + +static int32_t +nlc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + nlc_local_t *local = NULL; + nlc_conf_t *conf = NULL; + + local = frame->local; + conf = this->private; + + if (!local) + goto out; + + /* Donot add to pe, this may lead to duplicate entry and + * requires search before adding if list of strings */ + if (op_ret < 0 && op_errno == ENOENT) { + nlc_dir_add_ne(this, local->loc.parent, local->loc.name); + GF_ATOMIC_INC(conf->nlc_counter.nlc_miss); + } + +out: + NLC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +static int32_t +nlc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + nlc_local_t *local = NULL; + nlc_conf_t *conf = NULL; + inode_t *inode = NULL; + + if (loc_is_nameless(loc)) + goto wind; + + local = nlc_local_init(frame, this, GF_FOP_LOOKUP, loc, NULL); + if (!local) + goto err; + + conf = this->private; + + inode = inode_grep(loc->inode->table, loc->parent, loc->name); + if (inode) { + inode_unref(inode); + goto wind; + } + + if (nlc_is_negative_lookup(this, loc)) { + 
GF_ATOMIC_INC(conf->nlc_counter.nlc_hit); + gf_msg_trace(this->name, 0, + "Serving negative lookup from " + "cache:%s", + loc->name); + goto unwind; + } + +wind: + STACK_WIND(frame, nlc_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +unwind: + NLC_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); + return 0; +err: + NLC_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + return 0; +} + +static int32_t +nlc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + NLC_FOP_CBK(rmdir, _gf_false, frame, cookie, this, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) +{ + NLC_FOP(rmdir, GF_FOP_RMDIR, loc, NULL, frame, this, loc, flags, xdata); + return 0; +} + +static int32_t +nlc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + if (!IS_PEC_ENABLED(conf)) + goto out; + + if (op_ret < 0 && op_errno == ENOENT) { + GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_miss); + } + +out: + NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +static int32_t +nlc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + dict_t *dict = NULL; + nlc_local_t *local = NULL; + gf_boolean_t hit = _gf_false; + const char *fname = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + if (!IS_PEC_ENABLED(conf)) + goto wind; + + if (!key || (strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY, + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) != 0)) + goto wind; + + local = nlc_local_init(frame, 
this, GF_FOP_GETXATTR, loc, NULL); + if (!local) + goto err; + + if (loc->inode && key) { + dict = dict_new(); + if (!dict) + goto err; + + fname = key + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY); + hit = nlc_get_real_file_name(this, loc, fname, &op_ret, &op_errno, + dict); + if (hit) + goto unwind; + else + dict_unref(dict); + } + + STACK_WIND(frame, nlc_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +wind: + STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +unwind: + GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_hit); + NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, NULL); + dict_unref(dict); + return 0; +err: + NLC_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +nlc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(symlink, _gf_false, frame, cookie, this, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + NLC_FOP(symlink, GF_FOP_SYMLINK, loc, NULL, frame, this, linkpath, loc, + umask, xdata); + return 0; +} + +static int32_t +nlc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + NLC_FOP_CBK(link, _gf_false, frame, cookie, this, op_ret, op_errno, inode, + buf, preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + NLC_FOP(link, GF_FOP_LINK, oldloc, newloc, frame, this, oldloc, newloc, + xdata); + return 0; 
+} + +static int32_t +nlc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + uint32_t link_count = 0; + gf_boolean_t multilink = _gf_false; + + if (xdata && !dict_get_uint32(xdata, GET_LINK_COUNT, &link_count)) { + if (link_count > 1) + multilink = _gf_true; + } else { + /* Don't touch cache if we don't know enough */ + gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE, + "Failed to get GET_LINK_COUNT from dict"); + NLC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent, + xdata); + return 0; + } + + NLC_FOP_CBK(unlink, multilink, frame, cookie, this, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +static int32_t +nlc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata) +{ + nlc_conf_t *conf = NULL; + gf_boolean_t new_dict = _gf_false; + + conf = this->private; + + if (!IS_PEC_ENABLED(conf)) + goto do_fop; + + if (!xdata) { + xdata = dict_new(); + if (xdata) + new_dict = _gf_true; + } + + if (xdata && dict_set_uint32(xdata, GET_LINK_COUNT, 0)) { + gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE, + "Failed to set GET_LINK_COUNT in dict"); + goto err; + } + +do_fop: + NLC_FOP(unlink, GF_FOP_UNLINK, loc, NULL, frame, this, loc, flags, xdata); + + if (new_dict) + dict_unref(xdata); + return 0; +} + +static int32_t +nlc_invalidate(xlator_t *this, void *data) +{ + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + inode_t *parent1 = NULL; + inode_t *parent2 = NULL; + int ret = 0; + inode_table_t *itable = NULL; + nlc_conf_t *conf = NULL; + + up_data = (struct gf_upcall *)data; + + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + + conf = this->private; + if (!conf) + goto out; + + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + /*TODO: Add he inodes found as a 
member in gf_upcall_cache_invalidation + * so that it prevents subsequent xlators from doing inode_find again + */ + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; + } + + if ((!((up_ci->flags & UP_TIMES) && inode->ia_type == IA_IFDIR)) && + (!(up_ci->flags & UP_PARENT_DENTRY_FLAGS))) { + goto out; + } + + if (!gf_uuid_is_null(up_ci->p_stat.ia_gfid)) { + parent1 = inode_find(itable, up_ci->p_stat.ia_gfid); + if (!parent1) { + ret = -1; + goto out; + } + } + + if (!gf_uuid_is_null(up_ci->oldp_stat.ia_gfid)) { + parent2 = inode_find(itable, up_ci->oldp_stat.ia_gfid); + if (!parent2) { + ret = -1; + goto out; + } + } + + /* TODO: get enough data in upcall so that we do not invalidate but + * update */ + if (inode && inode->ia_type == IA_IFDIR) + nlc_inode_clear_cache(this, inode, NLC_NONE); + if (parent1) + nlc_inode_clear_cache(this, parent1, NLC_NONE); + if (parent2) + nlc_inode_clear_cache(this, parent2, NLC_NONE); + + GF_ATOMIC_INC(conf->nlc_counter.nlc_invals); + +out: + if (inode) + inode_unref(inode); + if (parent1) + inode_unref(parent1); + if (parent2) + inode_unref(parent2); + + return ret; +} + +int +nlc_notify(xlator_t *this, int event, void *data, ...) 
+{ + int ret = 0; + + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + case GF_EVENT_CHILD_UP: + case GF_EVENT_SOME_DESCENDENT_UP: + nlc_update_child_down_time(this, gf_time()); + /* TODO: nlc_clear_all_cache (this); else + lru prune will lazily clear it*/ + break; + case GF_EVENT_UPCALL: + ret = nlc_invalidate(this, data); + break; + case GF_EVENT_PARENT_DOWN: + nlc_disable_cache(this); + nlc_clear_all_cache(this); + default: + break; + } + + if (default_notify(this, event, data) != 0) + ret = -1; + + return ret; +} + +static int32_t +nlc_forget(xlator_t *this, inode_t *inode) +{ + uint64_t pe_int = 0; + uint64_t nlc_ctx_int = 0; + nlc_ctx_t *nlc_ctx = NULL; + nlc_conf_t *conf = NULL; + + conf = this->private; + + inode_ctx_reset1(inode, this, &pe_int); + GF_ASSERT(pe_int == 0); + + nlc_inode_clear_cache(this, inode, NLC_NONE); + inode_ctx_reset0(inode, this, &nlc_ctx_int); + nlc_ctx = (void *)(long)nlc_ctx_int; + if (nlc_ctx) { + GF_FREE(nlc_ctx); + GF_ATOMIC_SUB(conf->current_cache_size, sizeof(*nlc_ctx)); + } + + return 0; +} + +static int32_t +nlc_inodectx(xlator_t *this, inode_t *inode) +{ + nlc_dump_inodectx(this, inode); + return 0; +} + +static int32_t +nlc_priv_dump(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("negative_lookup_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_hit)); + gf_proc_dump_write("negative_lookup_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_miss)); + gf_proc_dump_write("get_real_filename_hit_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_hit)); + gf_proc_dump_write("get_real_filename_miss_count", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_miss)); + gf_proc_dump_write("nameless_lookup_count", "%" PRId64, + 
GF_ATOMIC_GET(conf->nlc_counter.nameless_lookup)); + gf_proc_dump_write("inodes_with_positive_dentry_cache", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.pe_inode_cnt)); + gf_proc_dump_write("inodes_with_negative_dentry_cache", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.ne_inode_cnt)); + gf_proc_dump_write("dentry_invalidations_received", "%" PRId64, + GF_ATOMIC_GET(conf->nlc_counter.nlc_invals)); + gf_proc_dump_write("cache_limit", "%" PRIu64, conf->cache_size); + gf_proc_dump_write("consumed_cache_size", "%" PRId64, + GF_ATOMIC_GET(conf->current_cache_size)); + gf_proc_dump_write("inode_limit", "%" PRIu64, conf->inode_limit); + gf_proc_dump_write("consumed_inodes", "%" PRId64, + GF_ATOMIC_GET(conf->refd_inodes)); + + return 0; +} + +static int32_t +nlc_dump_metrics(xlator_t *this, int fd) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + dprintf(fd, "%s.negative_lookup_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nlc_hit)); + dprintf(fd, "%s.negative_lookup_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nlc_miss)); + dprintf(fd, "%s.get_real_filename_hit_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_hit)); + dprintf(fd, "%s.get_real_filename_miss_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_miss)); + dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nameless_lookup)); + dprintf(fd, "%s.inodes_with_positive_dentry_cache %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->nlc_counter.pe_inode_cnt)); + dprintf(fd, "%s.inodes_with_negative_dentry_cache %" PRId64 "\n", + this->name, GF_ATOMIC_GET(conf->nlc_counter.ne_inode_cnt)); + dprintf(fd, "%s.dentry_invalidations_received %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->nlc_counter.nlc_invals)); + dprintf(fd, "%s.cache_limit %" PRIu64 "\n", this->name, conf->cache_size); + dprintf(fd, "%s.consumed_cache_size %" PRId64 "\n", 
this->name, + GF_ATOMIC_GET(conf->current_cache_size)); + dprintf(fd, "%s.inode_limit %" PRIu64 "\n", this->name, conf->inode_limit); + dprintf(fd, "%s.consumed_inodes %" PRId64 "\n", this->name, + GF_ATOMIC_GET(conf->refd_inodes)); + + return 0; +} + +void +nlc_fini(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + GF_FREE(conf); + + glusterfs_ctx_tw_put(this->ctx); + + return; +} + +int32_t +nlc_mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_nlc_mt_end + 1); + return ret; +} + +int32_t +nlc_reconfigure(xlator_t *this, dict_t *options) +{ + nlc_conf_t *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF("nl-cache-timeout", conf->cache_timeout, options, int32, + out); + GF_OPTION_RECONF("nl-cache-positive-entry", conf->positive_entry_cache, + options, bool, out); + GF_OPTION_RECONF("nl-cache-limit", conf->cache_size, options, size_uint64, + out); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + +out: + return 0; +} + +int32_t +nlc_init(xlator_t *this) +{ + nlc_conf_t *conf = NULL; + int ret = -1; + inode_table_t *itable = NULL; + + conf = GF_CALLOC(sizeof(*conf), 1, gf_nlc_mt_nlc_conf_t); + if (!conf) + goto out; + + GF_OPTION_INIT("nl-cache-timeout", conf->cache_timeout, int32, out); + GF_OPTION_INIT("nl-cache-positive-entry", conf->positive_entry_cache, bool, + out); + GF_OPTION_INIT("nl-cache-limit", conf->cache_size, size_uint64, out); + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + /* Since the positive entries are stored as list of refs on + * existing inodes, we should not overflow the inode lru_limit. + * Hence keep the limit of inodes that are refed by this xlator, + * to 80% of inode_table->lru_limit. In fuse where the limit is + * infinite, take 131072 as lru limit (as in gfapi). 
*/ + itable = ((xlator_t *)this->graph->top)->itable; + if (itable && itable->lru_limit) + conf->inode_limit = itable->lru_limit * 80 / 100; + else + conf->inode_limit = 131072 * 80 / 100; + + LOCK_INIT(&conf->lock); + GF_ATOMIC_INIT(conf->current_cache_size, 0); + GF_ATOMIC_INIT(conf->refd_inodes, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_hit, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_miss, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nameless_lookup, 0); + GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_hit, 0); + GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_miss, 0); + GF_ATOMIC_INIT(conf->nlc_counter.pe_inode_cnt, 0); + GF_ATOMIC_INIT(conf->nlc_counter.ne_inode_cnt, 0); + GF_ATOMIC_INIT(conf->nlc_counter.nlc_invals, 0); + + INIT_LIST_HEAD(&conf->lru); + conf->last_child_down = gf_time(); + + conf->timer_wheel = glusterfs_ctx_tw_get(this->ctx); + if (!conf->timer_wheel) { + gf_msg(this->name, GF_LOG_ERROR, 0, NLC_MSG_NO_TIMER_WHEEL, + "Initing the global timer wheel failed"); + goto out; + } + + this->private = conf; + + ret = 0; +out: + if (ret < 0) + GF_FREE(conf); + + return ret; +} + +struct xlator_fops nlc_fops = { + .rename = nlc_rename, + .mknod = nlc_mknod, + .create = nlc_create, + .mkdir = nlc_mkdir, + .lookup = nlc_lookup, + .rmdir = nlc_rmdir, + .getxattr = nlc_getxattr, + .symlink = nlc_symlink, + .link = nlc_link, + .unlink = nlc_unlink, + /* TODO: + .readdir = nlc_readdir, + .readdirp = nlc_readdirp, + .seek = nlc_seek, + .opendir = nlc_opendir, */ +}; + +struct xlator_cbks nlc_cbks = { + .forget = nlc_forget, +}; + +struct xlator_dumpops nlc_dumpops = { + .inodectx = nlc_inodectx, + .priv = nlc_priv_dump, +}; + +struct volume_options nlc_options[] = { + { + .key = {"nl-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable nl-cache", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"nl-cache-positive-entry"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = 
"false", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Cache the name of the files/directories that was" + " looked up and are present in a directory", + }, + { + .key = {"nl-cache-limit"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .default_value = "131072", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + /* The two literals are concatenated by the compiler; the second one + * must start with a space so the words do not run together. */ + .description = "the value over which caching will be disabled for" + " a while and the cache is cleared based on LRU", + }, + { + .key = {"nl-cache-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .default_value = "60", + .op_version = {GD_OP_VERSION_3_11_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .description = "Time period after which cache has to be refreshed", + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"nl-cache"}, + .description = "Enable/Disable nl cache translator"}, + + /* Terminating entry: a NULL key ends the option table. */ + {.key = {NULL}}, +}; + +/* Entry points and tables exported by this translator. */ +xlator_api_t xlator_api = { + .init = nlc_init, + .fini = nlc_fini, + .notify = nlc_notify, + .reconfigure = nlc_reconfigure, + .mem_acct_init = nlc_mem_acct_init, + .dump_metrics = nlc_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &nlc_dumpops, + .fops = &nlc_fops, + .cbks = &nlc_cbks, + .options = nlc_options, + .identifier = "nl-cache", + .category = GF_TECH_PREVIEW, +}; diff --git a/xlators/performance/nl-cache/src/nl-cache.h b/xlators/performance/nl-cache/src/nl-cache.h new file mode 100644 index 00000000000..85fcc176342 --- /dev/null +++ b/xlators/performance/nl-cache/src/nl-cache.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + * This file is part of GlusterFS. 
+ * + * This file is licensed to you under your choice of the GNU Lesser + * General Public License, version 3 or any later version (LGPLv3 or + * later), or the GNU General Public License, version 2 (GPLv2), in all + * cases as published by the Free Software Foundation. + */ + +#ifndef __NL_CACHE_H__ +#define __NL_CACHE_H__ + +#include "nl-cache-mem-types.h" +#include "nl-cache-messages.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/defaults.h> +#include <glusterfs/atomic.h> + +/* Cache state flags; IS_PE_VALID/IS_NE_VALID test them as a bitmask. */ +#define NLC_INVALID 0x0000 +#define NLC_PE_FULL 0x0001 +#define NLC_PE_PARTIAL 0x0002 +#define NLC_NE_VALID 0x0004 + +/* Macro arguments are parenthesized so that compound expressions passed as + * 'state' or 'conf' are evaluated as a unit. */ +#define IS_PE_VALID(state) \ + (((state) != NLC_INVALID) && ((state) & (NLC_PE_FULL | NLC_PE_PARTIAL))) +#define IS_NE_VALID(state) (((state) != NLC_INVALID) && ((state) & NLC_NE_VALID)) + +#define IS_PEC_ENABLED(conf) ((conf)->positive_entry_cache) +/* Reads the disable_cache member declared in struct nlc_conf. */ +#define IS_CACHE_ENABLED(conf) (!(conf)->disable_cache) + +/* Unwinds the fop and then wipes the nlc_local_t that was attached to the + * frame; frame->local is detached first so the unwind does not see it. */ +#define NLC_STACK_UNWIND(fop, frame, params...) \ + do { \ + nlc_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + nlc_local_wipe(__xl, __local); \ + } while (0) + +enum nlc_cache_clear_reason { + NLC_NONE = 0, + NLC_LRU_PRUNE, +}; + +struct nlc_ne { + struct list_head list; + char *name; +}; +typedef struct nlc_ne nlc_ne_t; + +struct nlc_pe { + struct list_head list; + inode_t *inode; + char *name; +}; +typedef struct nlc_pe nlc_pe_t; + +struct nlc_timer_data { + inode_t *inode; + xlator_t *this; +}; +typedef struct nlc_timer_data nlc_timer_data_t; + +struct nlc_lru_node { + inode_t *inode; + struct list_head list; +}; +typedef struct nlc_lru_node nlc_lru_node_t; + +struct nlc_ctx { + struct list_head pe; /* list of positive entries */ + struct list_head ne; /* list of negative entries */ + uint64_t state; + time_t cache_time; + struct gf_tw_timer_list *timer; + nlc_timer_data_t 
*timer_data; + size_t cache_size; + uint64_t refd_inodes; + gf_lock_t lock; +}; +typedef struct nlc_ctx nlc_ctx_t; + +struct nlc_local { + loc_t loc; + loc_t loc2; + inode_t *inode; + inode_t *parent; + fd_t *fd; + char *linkname; + glusterfs_fop_t fop; +}; +typedef struct nlc_local nlc_local_t; + +struct nlc_statistics { + gf_atomic_t nlc_hit; /* No. of times lookup/stat was served from this xl */ + gf_atomic_t nlc_miss; /* No. of times negative lookups were sent to disk */ + /* More granular counters */ + gf_atomic_t nameless_lookup; + gf_atomic_t getrealfilename_hit; + gf_atomic_t getrealfilename_miss; + gf_atomic_t pe_inode_cnt; + gf_atomic_t ne_inode_cnt; + gf_atomic_t nlc_invals; /* No. of invalidates received from upcall*/ +}; + +struct nlc_conf { + int32_t cache_timeout; + gf_boolean_t positive_entry_cache; + gf_boolean_t negative_entry_cache; + gf_boolean_t disable_cache; + uint64_t cache_size; + gf_atomic_t current_cache_size; + uint64_t inode_limit; + gf_atomic_t refd_inodes; + struct tvec_base *timer_wheel; + time_t last_child_down; + struct list_head lru; + gf_lock_t lock; + struct nlc_statistics nlc_counter; +}; +typedef struct nlc_conf nlc_conf_t; + +gf_boolean_t +nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname, + int32_t *op_ret, int32_t *op_errno, dict_t *dict); + +gf_boolean_t +nlc_is_negative_lookup(xlator_t *this, loc_t *loc); + +void +nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state); + +void +nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name); + +void +nlc_dir_remove_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino, + const char *name, gf_boolean_t multilink); + +void +nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name); + +void +nlc_local_wipe(xlator_t *this, nlc_local_t *local); + +nlc_local_t * +nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, loc_t *loc2); + +void +nlc_update_child_down_time(xlator_t *this, 
time_t now); + +void +nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason); + +void +nlc_dump_inodectx(xlator_t *this, inode_t *inode); + +void +nlc_clear_all_cache(xlator_t *this); + +void +nlc_disable_cache(xlator_t *this); + +void +nlc_lru_prune(xlator_t *this, inode_t *inode); + +#endif /* __NL_CACHE_H__ */ diff --git a/xlators/performance/open-behind/Makefile.am b/xlators/performance/open-behind/Makefile.am new file mode 100644 index 00000000000..af437a64d6d --- /dev/null +++ b/xlators/performance/open-behind/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am new file mode 100644 index 00000000000..41930dcd67d --- /dev/null +++ b/xlators/performance/open-behind/src/Makefile.am @@ -0,0 +1,16 @@ +xlator_LTLIBRARIES = open-behind.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +open_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +open_behind_la_SOURCES = open-behind.c +open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = open-behind-mem-types.h open-behind-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h new file mode 100644 index 00000000000..6c1ab2e19d2 --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __OB_MEM_TYPES_H__ +#define __OB_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_ob_mem_types_ { + gf_ob_mt_fd_t = gf_common_mt_end + 1, + gf_ob_mt_conf_t, + gf_ob_mt_inode_t, + gf_ob_mt_end +}; +#endif diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h new file mode 100644 index 00000000000..0e789177684 --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind-messages.h @@ -0,0 +1,32 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _OPEN_BEHIND_MESSAGES_H_ +#define _OPEN_BEHIND_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, + OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY, + OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE); + +#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop" +#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state" + +#endif /* _OPEN_BEHIND_MESSAGES_H_ */ diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c new file mode 100644 index 00000000000..600c3b62ffe --- /dev/null +++ b/xlators/performance/open-behind/src/open-behind.c @@ -0,0 +1,1101 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "open-behind-mem-types.h" +#include <glusterfs/xlator.h> +#include <glusterfs/statedump.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include "open-behind-messages.h" +#include <glusterfs/glusterfs-acl.h> + +/* Note: The initial design of open-behind was made to cover the simple case + * of open, read, close for small files. This pattern combined with + * quick-read can do the whole operation without a single request to the + * bricks (except the initial lookup). + * + * The way to do this has been improved, but the logic remains the same. + * Basically, this means that any operation sent to the fd or the inode + * that it's not a read, causes the open request to be sent to the + * bricks, and all future operations will be executed synchronously, + * including opens (it's reset once all fd's are closed). 
+ */ + +typedef struct ob_conf { + gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe + e.g - fstat() readv() + + whereas for fops like writev(), lk(), + the fd is important for side effects + like mandatory locks + */ + gf_boolean_t lazy_open; /* delay backend open as much as possible */ + gf_boolean_t read_after_open; /* instead of sending readvs on + anonymous fds, open the file + first and then send readv i.e + similar to what writev does + */ +} ob_conf_t; + +/* A negative state represents an errno value negated. In this case the + * current operation cannot be processed. */ +typedef enum _ob_state { + /* There are no opens on the inode or the first open is already + * completed. The current operation can be sent directly. */ + OB_STATE_READY = 0, + + /* There's an open pending and it has been triggered. The current + * operation should be "stubbified" and processed with + * ob_stub_dispatch(). */ + OB_STATE_OPEN_TRIGGERED, + + /* There's an open pending but it has not been triggered. The current + * operation can be processed directly but using an anonymous fd. */ + OB_STATE_OPEN_PENDING, + + /* The current operation is the first open on the inode. */ + OB_STATE_FIRST_OPEN +} ob_state_t; + +typedef struct ob_inode { + /* List of stubs pending on the first open. Once the first open is + * complete, all these stubs will be resubmitted, and dependencies + * will be checked again. */ + struct list_head resume_fops; + + /* The inode this object references. */ + inode_t *inode; + + /* The fd from the first open sent to this inode. It will be set + * from the moment the open is processed until the open is fully + * executed or closed before actually opened. It's NULL in all + * other cases. */ + fd_t *first_fd; + + /* The stub from the first open operation. When open fop starts + * being processed, it's assigned the OB_OPEN_PREPARING value + * until the actual stub is created. This is necessary to avoid + * creating the stub inside a locked region. 
Once the stub is + * successfully created, it's assigned here. This value is set + * to NULL once the stub is resumed. */ + call_stub_t *first_open; + + /* The total number of currently open fd's on this inode. */ + int32_t open_count; + + /* This flag is set as soon as we know that the open will be + * sent to the bricks, even before the stub is ready. */ + bool triggered; +} ob_inode_t; + +/* Dummy pointer used temporarily while the actual open stub is being created */ +#define OB_OPEN_PREPARING ((call_stub_t *)-1) + +#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...) \ + case OB_STATE_FIRST_OPEN: \ + gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE, \ + "fop=%s", #_fop, "state=%d", __ob_state, NULL); \ + default_##_fop##_failure_cbk(_frame, EINVAL); \ + break; \ + case OB_STATE_READY: \ + default_##_fop(_frame, _xl, ##_args); \ + break; \ + case OB_STATE_OPEN_TRIGGERED: { \ + call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop, \ + ##_args); \ + if (__ob_stub != NULL) { \ + ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub); \ + break; \ + } \ + __ob_state = -ENOMEM; \ + } \ + default: \ + gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state, \ + OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL); \ + default_##_fop##_failure_cbk(_frame, -__ob_state) + +#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...) \ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_fd( \ + _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + if (!(_trigger)) { \ + fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode, \ + (_fd)->flags); \ + if (__ob_fd != NULL) { \ + default_##_fop(_frame, _xl, ##_args); \ + fd_unref(__ob_fd); \ + break; \ + } \ + __ob_state = -ENOMEM; \ + } \ + OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +#define OB_POST_FLUSH(_xl, _frame, _fd, _args...) 
\ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_fd( \ + _xl, _fd, 0, true, false, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL); \ + break; \ + OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...) \ + do { \ + ob_inode_t *__ob_inode; \ + fd_t *__first_fd; \ + ob_state_t __ob_state = ob_open_and_resume_inode( \ + _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd); \ + switch (__ob_state) { \ + case OB_STATE_OPEN_PENDING: \ + OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ + } \ + } while (0) + +static ob_inode_t * +ob_inode_get_locked(xlator_t *this, inode_t *inode) +{ + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; + + if ((__inode_ctx_get(inode, this, &value) == 0) && (value != 0)) { + return (ob_inode_t *)(uintptr_t)value; + } + + ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); + if (ob_inode != NULL) { + ob_inode->inode = inode; + INIT_LIST_HEAD(&ob_inode->resume_fops); + + value = (uint64_t)(uintptr_t)ob_inode; + if (__inode_ctx_set(inode, this, &value) < 0) { + GF_FREE(ob_inode); + ob_inode = NULL; + } + } + + return ob_inode; +} + +static ob_state_t +ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd, + int32_t open_count, bool synchronous, bool trigger, + ob_inode_t **pob_inode, fd_t **pfd) +{ + ob_conf_t *conf; + ob_inode_t *ob_inode; + call_stub_t *open_stub; + + if (inode == NULL) { + return OB_STATE_READY; + } + + conf = xl->private; + + *pfd = NULL; + + LOCK(&inode->lock); + { + ob_inode = ob_inode_get_locked(xl, inode); + if (ob_inode == NULL) { + UNLOCK(&inode->lock); + + return -ENOMEM; + } + *pob_inode = ob_inode; + + ob_inode->open_count += open_count; + + /* If first_fd is not NULL, it means that there's a previous open not + * yet completed. 
*/ + if (ob_inode->first_fd != NULL) { + *pfd = ob_inode->first_fd; + /* If the current request doesn't trigger the open and it hasn't + * been triggered yet, we can continue without issuing the open + * only if the current request belongs to the same fd as the + * first one. */ + if (!trigger && !ob_inode->triggered && + (ob_inode->first_fd == fd)) { + UNLOCK(&inode->lock); + + return OB_STATE_OPEN_PENDING; + } + + /* We need to issue the open. It could have already been triggered + * before. In this case open_stub will be NULL. Or the initial open + * may not be completely ready yet. In this case open_stub will be + * OB_OPEN_PREPARING. */ + open_stub = ob_inode->first_open; + ob_inode->first_open = NULL; + ob_inode->triggered = true; + + UNLOCK(&inode->lock); + + if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) { + call_resume(open_stub); + } + + return OB_STATE_OPEN_TRIGGERED; + } + + /* There's no pending open. Only opens can be non synchronous, so all + * regular fops will be processed directly. For non synchronous opens, + * we'll still process them normally (i.e. synchronously) if there are + * more file descriptors open. */ + if (synchronous || (ob_inode->open_count > open_count)) { + UNLOCK(&inode->lock); + + return OB_STATE_READY; + } + + *pfd = fd; + + /* This is the first open. We keep a reference on the fd and set + * first_open stub to OB_OPEN_PREPARING until the actual stub can + * be assigned (we don't create the stub here to avoid doing memory + * allocations inside the mutex). */ + ob_inode->first_fd = __fd_ref(fd); + ob_inode->first_open = OB_OPEN_PREPARING; + + /* If lazy_open is not set, we'll need to immediately send the open, + * so we set triggered right now. 
*/ + ob_inode->triggered = !conf->lazy_open; + } + UNLOCK(&inode->lock); + + return OB_STATE_FIRST_OPEN; +} + +static ob_state_t +ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count, + bool synchronous, bool trigger, ob_inode_t **pob_inode, + fd_t **pfd) +{ + uint64_t err; + + if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) { + return (ob_state_t)-err; + } + + return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous, + trigger, pob_inode, pfd); +} + +static ob_state_t +ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode, + fd_t **pfd) +{ + bool synchronous; + + /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't + * we also execute this open synchronously ? */ + synchronous = (flags & O_TRUNC) != 0; + + return ob_open_and_resume_fd(xl, fd, 1, synchronous, true, pob_inode, pfd); +} + +static int32_t +ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + call_stub_t *stub) +{ + LOCK(&ob_inode->inode->lock); + { + /* We only queue a stub if the open has not been completed or + * cancelled. 
*/ + if (ob_inode->first_fd == fd) { + list_add_tail(&stub->list, &ob_inode->resume_fops); + stub = NULL; + } + } + UNLOCK(&ob_inode->inode->lock); + + if (stub != NULL) { + call_resume(stub); + } + + return 0; +} + +static void +ob_open_destroy(call_stub_t *stub, fd_t *fd) +{ + stub->frame->local = NULL; + STACK_DESTROY(stub->frame->root); + call_stub_destroy(stub); + fd_unref(fd); +} + +static int32_t +ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, + call_stub_t *stub) +{ + bool closed; + + LOCK(&ob_inode->inode->lock); + { + closed = ob_inode->first_fd != fd; + if (!closed) { + if (ob_inode->triggered) { + ob_inode->first_open = NULL; + } else { + ob_inode->first_open = stub; + stub = NULL; + } + } + } + UNLOCK(&ob_inode->inode->lock); + + if (stub != NULL) { + if (closed) { + ob_open_destroy(stub, fd); + } else { + call_resume(stub); + } + } + + return 0; +} + +static void +ob_resume_pending(struct list_head *list) +{ + call_stub_t *stub; + + while (!list_empty(list)) { + stub = list_first_entry(list, call_stub_t, list); + list_del_init(&stub->list); + + call_resume(stub); + } +} + +static void +ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret, + int32_t op_errno) +{ + struct list_head list; + + INIT_LIST_HEAD(&list); + + if (op_ret < 0) { + fd_ctx_set(fd, xl, op_errno <= 0 ? EIO : op_errno); + } + + LOCK(&ob_inode->inode->lock); + { + /* Only update the fields if the file has not been closed before + * getting here. 
*/ + if (ob_inode->first_fd == fd) { + list_splice_init(&ob_inode->resume_fops, &list); + ob_inode->first_fd = NULL; + ob_inode->first_open = NULL; + ob_inode->triggered = false; + } + } + UNLOCK(&ob_inode->inode->lock); + + ob_resume_pending(&list); + + fd_unref(fd); +} + +/* Callback of the delayed open wound by ob_open_resume(). 'cookie' is the fd + * given to STACK_WIND_COOKIE and frame->local is the ob_inode stored by + * ob_open(). Completes the open and destroys the private frame. */ +static int32_t +ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + ob_inode_t *ob_inode; + + ob_inode = frame->local; + frame->local = NULL; + + ob_open_completed(xl, ob_inode, cookie, op_ret, op_errno); + + STACK_DESTROY(frame->root); + + return 0; +} + +/* Stub resume function for the first open: winds the real open to the first + * child, passing the fd as cookie. */ +static int32_t +ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +} + +/* open fop. Depending on the state returned by ob_open_behind() the open is + * forwarded directly, queued behind the first open, or answered immediately + * while the real open is kept in a stub. Any other outcome is reported + * through default_open_failure_cbk(). */ +static int32_t +ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) +{ + /* Stays NULL when ob_open_behind() fails before obtaining the inode + * context, so the failure path can tell whether there is a count to + * undo. */ + ob_inode_t *ob_inode = NULL; + call_frame_t *open_frame; + call_stub_t *stub; + fd_t *first_fd; + ob_state_t state; + + state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd); + if (state == OB_STATE_READY) { + /* There's no pending open, but there are other file descriptors opened + * or the current flags require a synchronous open. */ + return default_open(frame, this, loc, flags, fd, xdata); + } + + if (state == OB_STATE_OPEN_TRIGGERED) { + /* The first open is in progress (either because it was already issued + * or because this request triggered it). We try to create a new stub + * to retry the operation once the initial open completes. */ + stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata); + if (stub != NULL) { + return ob_stub_dispatch(this, ob_inode, first_fd, stub); + } + + state = -ENOMEM; + } + + if (state == OB_STATE_FIRST_OPEN) { + /* We try to create a stub for the new open. 
A new frame needs to be + * used because the current one may be destroyed soon after sending + * the open's reply. */ + open_frame = copy_frame(frame); + if (open_frame != NULL) { + stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd, + xdata); + if (stub != NULL) { + open_frame->local = ob_inode; + + /* TODO: Previous version passed xdata back to the caller, but + * probably this doesn't make sense since it won't contain + * any requested data. I think it would be better to pass + * NULL for xdata. */ + default_open_cbk(frame, NULL, this, 0, 0, fd, xdata); + + return ob_open_dispatch(this, ob_inode, first_fd, stub); + } + + STACK_DESTROY(open_frame->root); + } + + /* In case of error, simulate a regular completion but with an error + * code. */ + ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM); + + state = -ENOMEM; + } + + /* In case of failure we need to decrement the number of open files because + * ob_fdclose() won't be called. If the failure happened before the inode + * context was obtained, ob_inode is NULL and the count was never + * incremented, so there is nothing to undo. */ + + if (ob_inode != NULL) { + LOCK(&fd->inode->lock); + { + ob_inode->open_count--; + } + UNLOCK(&fd->inode->lock); + } + + gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", + "open", "path=%s", loc->path, NULL); + + return default_open_failure_cbk(frame, -state); +} + +/* create fop. Always processed synchronously: it is either forwarded + * directly or queued behind a first open that is already in progress. Any + * other outcome is reported through default_create_failure_cbk(). */ +static int32_t +ob_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + /* Stays NULL when ob_open_and_resume_fd() fails before obtaining the + * inode context. */ + ob_inode_t *ob_inode = NULL; + call_stub_t *stub; + fd_t *first_fd; + ob_state_t state; + + /* Create requests are never delayed. We always send them synchronously. */ + state = ob_open_and_resume_fd(this, fd, 1, true, true, &ob_inode, + &first_fd); + if (state == OB_STATE_READY) { + /* There's no pending open, but there are other file descriptors opened + * so we simply forward the request synchronously. 
*/ + return default_create(frame, this, loc, flags, mode, umask, fd, xdata); + } + + if (state == OB_STATE_OPEN_TRIGGERED) { + /* The first open is in progress (either because it was already issued + * or because this request triggered it). We try to create a new stub + * to retry the operation once the initial open completes. */ + stub = fop_create_stub(frame, ob_create, loc, flags, mode, umask, fd, + xdata); + if (stub != NULL) { + return ob_stub_dispatch(this, ob_inode, first_fd, stub); + } + + state = -ENOMEM; + } + + /* Since we forced a synchronous request, OB_STATE_FIRST_OPEN will never + * be returned by ob_open_and_resume_fd(). If we are here it can only be + * because there has been a problem. */ + + /* In case of failure we need to decrement the number of open files because + * ob_fdclose() won't be called. If the failure happened before the inode + * context was obtained, ob_inode is NULL and the count was never + * incremented, so there is nothing to undo. */ + + if (ob_inode != NULL) { + LOCK(&fd->inode->lock); + { + ob_inode->open_count--; + } + UNLOCK(&fd->inode->lock); + } + + gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", + "create", "path=%s", loc->path, NULL); + + return default_create_failure_cbk(frame, -state); +} + +/* readv fop. The pending open is triggered when read_after_open is set or + * anonymous fds are not allowed. */ +static int32_t +ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + ob_conf_t *conf = this->private; + bool trigger = conf->read_after_open || !conf->use_anonymous_fd; + + OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata); + + return 0; +} + +/* writev fop. Always triggers the pending open. */ +static int32_t +ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags, + iobref, xdata); + + return 0; +} + +/* fstat fop. The pending open is triggered only when anonymous fds are not + * allowed. */ +static int32_t +ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ob_conf_t *conf = this->private; + bool trigger = !conf->use_anonymous_fd; + + OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata); + + return 0; +} + +static int32_t 
+ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + ob_conf_t *conf = this->private; + bool trigger = !conf->use_anonymous_fd; + + OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata); + + return 0; +} + +static int32_t +ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + OB_POST_FLUSH(this, frame, fd, fd, xdata); + + return 0; +} + +static int32_t +ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata) +{ + OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata); + + return 0; +} + +static int32_t +ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) +{ + OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata); + + return 0; +} + +static int32_t +ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata); + + return 0; +} + +static int32_t +ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata); + + return 0; +} + +static int32_t +ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata); + + return 0; +} + +static int32_t +ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + OB_POST_FD(fremovexattr, this, frame, fd, true, fd, name, xdata); + + return 0; +} + +static int32_t +ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) +{ + OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata); + + return 0; +} + +static int32_t +ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char 
*basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type, + xdata); + + return 0; +} + +static int32_t +ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ + OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata); + + return 0; +} + +static int32_t +ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt, + int valid, dict_t *xdata) +{ + OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata); + + return 0; +} + +static int32_t +ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, len, xdata); + + return 0; +} + +static int32_t +ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata); + + return 0; +} + +static int32_t +ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata); + + return 0; +} + +static int32_t +ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) +{ + OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata); + + return 0; +} + +static int32_t +ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, + dict_t *xdata) +{ + OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata); + + return 0; +} + +static int32_t +ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid, + xdata); + + return 0; +} + +static int32_t +ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t 
*dict, + int32_t flags, dict_t *xdata) +{ + if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) || + dict_get(dict, POSIX_ACL_ACCESS_XATTR) || + dict_get(dict, GF_SELINUX_XATTR_KEY)) { + return default_setxattr(frame, this, loc, dict, flags, xdata); + } + + OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags, + xdata); + + return 0; +} + +static void +ob_fdclose(xlator_t *this, fd_t *fd) +{ + struct list_head list; + ob_inode_t *ob_inode; + call_stub_t *stub; + + INIT_LIST_HEAD(&list); + stub = NULL; + + LOCK(&fd->inode->lock); + { + ob_inode = ob_inode_get_locked(this, fd->inode); + if (ob_inode != NULL) { + ob_inode->open_count--; + + /* If this fd is the same as ob_inode->first_fd, it means that + * the initial open has not fully completed. We'll try to cancel + * it. */ + if (ob_inode->first_fd == fd) { + if (ob_inode->first_open == OB_OPEN_PREPARING) { + /* In this case ob_open_dispatch() has not been called yet. + * We clear first_fd and first_open to allow that function + * to know that the open is not really needed. This also + * allows other requests to work as expected if they + * arrive before the dispatch function is called. If there + * are pending fops, we can directly process them here. + * (note that there shouldn't be any fd related fops, but + * if there are, it's fine if they fail). */ + ob_inode->first_fd = NULL; + ob_inode->first_open = NULL; + ob_inode->triggered = false; + list_splice_init(&ob_inode->resume_fops, &list); + } else if (!ob_inode->triggered) { + /* If the open has already been dispatched, we can only + * cancel it if it has not been triggered. Otherwise we + * simply wait until it completes. While it's not triggered, + * first_open must be a valid stub and there can't be any + * pending fops. 
*/ + GF_ASSERT((ob_inode->first_open != NULL) && + list_empty(&ob_inode->resume_fops)); + + ob_inode->first_fd = NULL; + stub = ob_inode->first_open; + ob_inode->first_open = NULL; + } + } + } + } + UNLOCK(&fd->inode->lock); + + if (stub != NULL) { + ob_open_destroy(stub, fd); + } + + ob_resume_pending(&list); +} + +int +ob_forget(xlator_t *this, inode_t *inode) +{ + ob_inode_t *ob_inode; + uint64_t value = 0; + + if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { + ob_inode = (ob_inode_t *)(uintptr_t)value; + GF_FREE(ob_inode); + } + + return 0; +} + +int +ob_priv_dump(xlator_t *this) +{ + ob_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + + conf = this->private; + + if (!conf) + return -1; + + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", + "priv"); + + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("use_anonymous_fd", "%d", conf->use_anonymous_fd); + + gf_proc_dump_write("lazy_open", "%d", conf->lazy_open); + + return 0; +} + +int +ob_fdctx_dump(xlator_t *this, fd_t *fd) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + uint64_t value = 0; + int ret = 0, error = 0; + + ret = TRY_LOCK(&fd->lock); + if (ret) + return 0; + + if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) { + error = (int32_t)value; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", + "file"); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("fd", "%p", fd); + + gf_proc_dump_write("error", "%d", error); + + UNLOCK(&fd->lock); + + return 0; +} + +int +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init(this, gf_ob_mt_end + 1); + + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, OPEN_BEHIND_MSG_NO_MEMORY, + "Memory accounting failed"); + + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + ob_conf_t *conf = NULL; + int ret = -1; + + conf = this->private; + + GF_OPTION_RECONF("use-anonymous-fd", 
conf->use_anonymous_fd, options, bool, + out); + + GF_OPTION_RECONF("lazy-open", conf->lazy_open, options, bool, out); + + GF_OPTION_RECONF("read-after-open", conf->read_after_open, options, bool, + out); + + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); + ret = 0; +out: + return ret; +} + +int +init(xlator_t *this) +{ + ob_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: volume (%s) not configured with exactly one " + "child", + this->name); + return -1; + } + + if (!this->parents) + gf_msg(this->name, GF_LOG_WARNING, 0, OPEN_BEHIND_MSG_VOL_MISCONFIGURED, + "dangling volume. check volfile "); + + conf = GF_CALLOC(1, sizeof(*conf), gf_ob_mt_conf_t); + if (!conf) + goto err; + + GF_OPTION_INIT("use-anonymous-fd", conf->use_anonymous_fd, bool, err); + + GF_OPTION_INIT("lazy-open", conf->lazy_open, bool, err); + + GF_OPTION_INIT("read-after-open", conf->read_after_open, bool, err); + + GF_OPTION_INIT("pass-through", this->pass_through, bool, err); + + this->private = conf; + + return 0; +err: + if (conf) + GF_FREE(conf); + + return -1; +} + +void +fini(xlator_t *this) +{ + ob_conf_t *conf = NULL; + + conf = this->private; + + GF_FREE(conf); + + return; +} + +struct xlator_fops fops = { + .open = ob_open, + .create = ob_create, + .readv = ob_readv, + .writev = ob_writev, + .flush = ob_flush, + .fsync = ob_fsync, + .fstat = ob_fstat, + .seek = ob_seek, + .ftruncate = ob_ftruncate, + .fsetxattr = ob_fsetxattr, + .setxattr = ob_setxattr, + .fgetxattr = ob_fgetxattr, + .fremovexattr = ob_fremovexattr, + .finodelk = ob_finodelk, + .fentrylk = ob_fentrylk, + .fxattrop = ob_fxattrop, + .fsetattr = ob_fsetattr, + .setattr = ob_setattr, + .fallocate = ob_fallocate, + .discard = ob_discard, + .zerofill = ob_zerofill, + .unlink = ob_unlink, + .rename = ob_rename, + .lk = ob_lk, +}; + +struct xlator_cbks cbks = { + .fdclose = ob_fdclose, 
+ .forget = ob_forget, +}; + +struct xlator_dumpops dumpops = { + .priv = ob_priv_dump, + .fdctx = ob_fdctx_dump, +}; + +struct volume_options options[] = { + { + .key = {"open-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable open-behind", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"use-anonymous-fd"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = + "For read operations, use anonymous FD when " + "original FD is open-behind and not yet opened in the backend.", + }, + { + .key = {"lazy-open"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = + "Perform open in the backend only when a necessary " + "FOP arrives (e.g writev on the FD, unlink of the file). When " + "option " + "is disabled, perform backend open right after unwinding open().", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {}, + /* option_validation_fn validate_fn; */ + }, + { + .key = {"read-after-open"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes", + .description = "read is sent only after actual open happens and real " + "fd is obtained, instead of doing on anonymous fd " + "(similar to write)", + .op_version = {3}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {}, + /* option_validation_fn validate_fn; */ + }, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"open-behind"}, + .description = "Enable/Disable open behind translator"}, + {.key = {NULL}} + +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = 
"open-behind", + .category = GF_MAINTAINED, +}; diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am index db917f897c8..8eb6cece738 100644 --- a/xlators/performance/quick-read/src/Makefile.am +++ b/xlators/performance/quick-read/src/Makefile.am @@ -1,14 +1,16 @@ xlator_LTLIBRARIES = quick-read.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -quick_read_la_LDFLAGS = -module -avoidversion +quick_read_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) quick_read_la_SOURCES = quick-read.c quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = quick-read.h quick-read-mem-types.h +noinst_HEADERS = quick-read.h quick-read-mem-types.h quick-read-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/quick-read/src/quick-read-mem-types.h b/xlators/performance/quick-read/src/quick-read-mem-types.h index b6a65e57cb0..e4aef8549ff 100644 --- a/xlators/performance/quick-read/src/quick-read-mem-types.h +++ b/xlators/performance/quick-read/src/quick-read-mem-types.h @@ -1,35 +1,23 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __QR_MEM_TYPES_H__ #define __QR_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_qr_mem_types_ { - gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, - gf_qr_mt_qr_fd_ctx_t, - gf_qr_mt_qr_local_t, - gf_qr_mt_iovec, - gf_qr_mt_qr_conf_t, - gf_qr_mt_qr_priority_t, - gf_qr_mt_qr_private_t, - gf_qr_mt_end + gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, + gf_qr_mt_content_t, + gf_qr_mt_qr_priority_t, + gf_qr_mt_qr_private_t, + gf_qr_mt_end }; #endif diff --git a/xlators/performance/quick-read/src/quick-read-messages.h b/xlators/performance/quick-read/src/quick-read-messages.h new file mode 100644 index 00000000000..da9724a3c9c --- /dev/null +++ b/xlators/performance/quick-read/src/quick-read-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _QUICK_READ_MESSAGES_H_ +#define _QUICK_READ_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. 
If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(QUICK_READ, QUICK_READ_MSG_ENFORCEMENT_FAILED, + QUICK_READ_MSG_INVALID_ARGUMENT, + QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED, QUICK_READ_MSG_NO_MEMORY, + QUICK_READ_MSG_VOL_MISCONFIGURED, QUICK_READ_MSG_DICT_SET_FAILED, + QUICK_READ_MSG_INVALID_CONFIG, QUICK_READ_MSG_LRU_NOT_EMPTY); + +#endif /* _QUICK_READ_MESSAGES_H_ */ diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c index 6c9a0f0e5b5..7fe4b3c3a4b 100644 --- a/xlators/performance/quick-read/src/quick-read.c +++ b/xlators/performance/quick-read/src/quick-read.c @@ -1,3659 +1,1644 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ +#include <math.h> #include "quick-read.h" -#include "statedump.h" +#include <glusterfs/statedump.h> +#include "quick-read-messages.h" +#include <glusterfs/upcall-utils.h> +#include <glusterfs/atomic.h> -#define QR_DEFAULT_CACHE_SIZE 134217728 +typedef struct qr_local { + inode_t *inode; + uint64_t incident_gen; + fd_t *fd; +} qr_local_t; -struct volume_options options[]; +qr_inode_t * +qr_inode_ctx_get(xlator_t *this, inode_t *inode); void -qr_local_free (qr_local_t *local) -{ - if (local == NULL) { - goto out; - } +__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode); - if (local->stub != NULL) { - call_stub_destroy (local->stub); - } +void +qr_local_wipe(qr_local_t *local) +{ + if (!local) + goto out; - if (local->path != NULL) { - GF_FREE (local->path); - } + if (local->inode) + inode_unref(local->inode); - GF_FREE (local); + if (local->fd) + fd_unref(local->fd); + GF_FREE(local); out: - return; + return; } +uint64_t +__qr_get_generation(xlator_t *this, qr_inode_t *qr_inode) +{ + uint64_t gen = 0, rollover; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; -int32_t -qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset); + priv = this->private; + table = &priv->table; + gen = GF_ATOMIC_INC(priv->generation); + if (gen == 0) { + qr_inode->gen_rollover = !qr_inode->gen_rollover; + gen = GF_ATOMIC_INC(priv->generation); + __qr_inode_prune_data(this, table, qr_inode); + qr_inode->gen = qr_inode->invalidation_time = gen - 1; + } -static void -qr_loc_wipe (loc_t *loc) + rollover = qr_inode->gen_rollover; + gen |= (rollover << 32); + return gen; +} + +uint64_t +qr_get_generation(xlator_t *this, inode_t *inode) { - if (loc == NULL) { - goto out; - } + qr_inode_t *qr_inode = NULL; + uint64_t gen = 0; + qr_inode_table_t *table = NULL; + qr_private_t *priv = NULL; - if (loc->path) { - GF_FREE ((char *)loc->path); - loc->path = NULL; - } + priv = this->private; + table = &priv->table; - if 
(loc->inode) { - inode_unref (loc->inode); - loc->inode = NULL; - } + qr_inode = qr_inode_ctx_get(this, inode); - if (loc->parent) { - inode_unref (loc->parent); - loc->parent = NULL; + if (qr_inode) { + LOCK(&table->lock); + { + gen = __qr_get_generation(this, qr_inode); + } + UNLOCK(&table->lock); + } else { + gen = GF_ATOMIC_INC(priv->generation); + if (gen == 0) { + gen = GF_ATOMIC_INC(priv->generation); } + } -out: - return; + return gen; } - -static int32_t -qr_loc_fill (loc_t *loc, inode_t *inode, char *path) +qr_local_t * +qr_local_get(xlator_t *this, inode_t *inode) { - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", loc, out, errno, EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", inode, out, errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", path, out, errno, EINVAL); + qr_local_t *local = NULL; - loc->inode = inode_ref (inode); - uuid_copy (loc->gfid, inode->gfid); + local = GF_CALLOC(1, sizeof(*local), gf_common_mt_char); + if (!local) + goto out; - loc->path = gf_strdup (path); - if (!loc->path) - goto out; - - ret = 0; + local->incident_gen = qr_get_generation(this, inode); out: - if (ret == -1) { - qr_loc_wipe (loc); - } - - return ret; + return local; } +#define QR_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + qr_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + qr_local_wipe(__local); \ + } while (0) void -qr_resume_pending_ops (qr_fd_ctx_t *qr_fd_ctx, int32_t op_ret, int32_t op_errno) -{ - call_stub_t *stub = NULL, *tmp = NULL; - struct list_head waiting_ops = {0, }; +__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode, + uint64_t gen); - GF_VALIDATE_OR_GOTO ("quick-read", qr_fd_ctx, out); - - INIT_LIST_HEAD (&waiting_ops); - - LOCK (&qr_fd_ctx->lock); - { - qr_fd_ctx->open_in_transit = 0; - list_splice_init (&qr_fd_ctx->waiting_ops, - &waiting_ops); - } - UNLOCK (&qr_fd_ctx->lock); - - if (!list_empty (&waiting_ops)) { - list_for_each_entry_safe (stub, tmp, &waiting_ops, list) { - list_del_init (&stub->list); - if (op_ret < 0) { - qr_local_t *local = NULL; +int +__qr_inode_ctx_set(xlator_t *this, inode_t *inode, qr_inode_t *qr_inode) +{ + uint64_t value = 0; + int ret = -1; - local = stub->frame->local; - local->op_ret = op_ret; - local->op_errno = op_errno; - } + value = (long)qr_inode; - call_resume (stub); - } - } + ret = __inode_ctx_set(inode, this, &value); -out: - return; + return ret; } - -static void -qr_fd_ctx_free (qr_fd_ctx_t *qr_fd_ctx) +qr_inode_t * +__qr_inode_ctx_get(xlator_t *this, inode_t *inode) { - GF_VALIDATE_OR_GOTO ("quick-read", qr_fd_ctx, out); + qr_inode_t *qr_inode = NULL; + uint64_t value = 0; + int ret = -1; - GF_ASSERT (list_empty (&qr_fd_ctx->waiting_ops)); + ret = __inode_ctx_get(inode, this, &value); + if (ret) + return NULL; - GF_FREE (qr_fd_ctx->path); - GF_FREE (qr_fd_ctx); + qr_inode = (void *)((long)value); -out: - return; + return qr_inode; } - -static inline uint32_t -is_match (const char *path, const char *pattern) +qr_inode_t * +qr_inode_ctx_get(xlator_t *this, inode_t *inode) { - int32_t ret = 0; - uint32_t match = 0; + qr_inode_t *qr_inode = NULL; - GF_VALIDATE_OR_GOTO ("quick-read", 
path, out); - GF_VALIDATE_OR_GOTO ("quick-read", pattern, out); + if (inode == NULL) + goto out; - ret = fnmatch (pattern, path, FNM_NOESCAPE); - match = (ret == 0); + LOCK(&inode->lock); + { + qr_inode = __qr_inode_ctx_get(this, inode); + } + UNLOCK(&inode->lock); out: - return match; + return qr_inode; } - -uint32_t -qr_get_priority (qr_conf_t *conf, const char *path) +qr_inode_t * +qr_inode_new(xlator_t *this, inode_t *inode) { - uint32_t priority = 0; - struct qr_priority *curr = NULL; + qr_inode_t *qr_inode = NULL; - GF_VALIDATE_OR_GOTO ("quick-read", conf, out); - GF_VALIDATE_OR_GOTO ("quick-read", path, out); + qr_inode = GF_CALLOC(1, sizeof(*qr_inode), gf_qr_mt_qr_inode_t); + if (!qr_inode) + return NULL; - list_for_each_entry (curr, &conf->priority_list, list) { - if (is_match (path, curr->pattern)) - priority = curr->priority; - } + INIT_LIST_HEAD(&qr_inode->lru); -out: - return priority; -} + qr_inode->priority = 0; /* initial priority */ + return qr_inode; +} -/* To be called with this-priv->table.lock held */ qr_inode_t * -__qr_inode_alloc (xlator_t *this, char *path, inode_t *inode) +qr_inode_ctx_get_or_new(xlator_t *this, inode_t *inode) { - qr_inode_t *qr_inode = NULL; - qr_private_t *priv = NULL; - int priority = 0; + qr_inode_t *qr_inode = NULL; + int ret = -1; + qr_private_t *priv = NULL; - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - GF_VALIDATE_OR_GOTO (this->name, path, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + priv = this->private; - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); + LOCK(&inode->lock); + { + qr_inode = __qr_inode_ctx_get(this, inode); + if (qr_inode) + goto unlock; - qr_inode = GF_CALLOC (1, sizeof (*qr_inode), gf_qr_mt_qr_inode_t); - if (qr_inode == NULL) { - goto out; - } + qr_inode = qr_inode_new(this, inode); + if (!qr_inode) + goto unlock; - INIT_LIST_HEAD (&qr_inode->lru); - - priority = qr_get_priority (&priv->conf, path); - - list_add_tail (&qr_inode->lru, 
&priv->table.lru[priority]); + ret = __qr_inode_ctx_set(this, inode, qr_inode); + if (ret) { + __qr_inode_prune(this, &priv->table, qr_inode, 0); + GF_FREE(qr_inode); + qr_inode = NULL; + } + } +unlock: + UNLOCK(&inode->lock); - qr_inode->inode = inode; - qr_inode->priority = priority; -out: - return qr_inode; + return qr_inode; } - -/* To be called with qr_inode->table->lock held */ -void -__qr_inode_free (qr_inode_t *qr_inode) +uint32_t +qr_get_priority(qr_conf_t *conf, const char *path) { - GF_VALIDATE_OR_GOTO ("quick-read", qr_inode, out); + uint32_t priority = 0; + struct qr_priority *curr = NULL; - if (qr_inode->xattr) { - dict_unref (qr_inode->xattr); - } - - list_del (&qr_inode->lru); + list_for_each_entry(curr, &conf->priority_list, list) + { + if (fnmatch(curr->pattern, path, FNM_NOESCAPE) == 0) + priority = curr->priority; + } - GF_FREE (qr_inode); -out: - return; + return priority; } -/* To be called with priv->table.lock held */ void -__qr_cache_prune (xlator_t *this) +__qr_inode_register(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode) { - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - qr_inode_table_t *table = NULL; - qr_inode_t *curr = NULL, *next = NULL; - int32_t index = 0; - uint64_t size_to_prune = 0; - uint64_t size_pruned = 0; - - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - - table = &priv->table; - conf = &priv->conf; - - size_to_prune = table->cache_used - conf->cache_size; - - for (index=0; index < conf->max_pri; index++) { - list_for_each_entry_safe (curr, next, &table->lru[index], lru) { - size_pruned += curr->stbuf.ia_size; - inode_ctx_del (curr->inode, this, NULL); - __qr_inode_free (curr); - if (size_pruned >= size_to_prune) - goto done; - } - } + qr_private_t *priv = NULL; -done: - table->cache_used -= size_pruned; + if (!qr_inode->data) + return; -out: + priv = this->private; + if (!priv) return; -} -/* To be called with table->lock 
held */ -inline char -__qr_need_cache_prune (qr_conf_t *conf, qr_inode_table_t *table) -{ - char need_prune = 0; + if (list_empty(&qr_inode->lru)) + /* first time addition of this qr_inode into table */ + table->cache_used += qr_inode->size; + else + list_del_init(&qr_inode->lru); - GF_VALIDATE_OR_GOTO ("quick-read", conf, out); - GF_VALIDATE_OR_GOTO ("quick-read", table, out); + list_add_tail(&qr_inode->lru, &table->lru[qr_inode->priority]); - need_prune = (table->cache_used > conf->cache_size); + GF_ATOMIC_INC(priv->qr_counter.files_cached); -out: - return need_prune; + return; } - -int32_t -qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) +void +qr_inode_set_priority(xlator_t *this, inode_t *inode, const char *path) { - data_t *content = NULL; - qr_inode_t *qr_inode = NULL; - uint64_t value = 0; - int ret = -1; - qr_conf_t *conf = NULL; - qr_inode_table_t *table = NULL; - qr_private_t *priv = NULL; - qr_local_t *local = NULL; - - GF_ASSERT (frame); - - if ((op_ret == -1) || (dict == NULL)) { - goto out; - } - - if ((this == NULL) || (this->private == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "quick-read configuration is not found"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + uint32_t priority = 0; + qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; - priv = this->private; - conf = &priv->conf; - table = &priv->table; - - local = frame->local; - - if (buf->ia_size > conf->max_file_size) { - goto out; - } - - if (IA_ISDIR (buf->ia_type)) { - goto out; - } + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + return; - if (inode == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "lookup returned a NULL inode"); - goto out; - } + priv = this->private; + table = &priv->table; + conf = &priv->conf; - content = dict_get (dict, GF_CONTENT_KEY); - if (content == NULL) { - goto out; - } + if (path) + priority = qr_get_priority(conf, path); + else + /* retain existing priority, just bump LRU */ + priority = qr_inode->priority; - LOCK (&table->lock); - { - ret = inode_ctx_get (inode, this, &value); - if (ret == -1) { - qr_inode = __qr_inode_alloc (this, local->path, inode); - if (qr_inode == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unlock; - } - - ret = inode_ctx_put (inode, this, - (uint64_t)(long)qr_inode); - if (ret == -1) { - __qr_inode_free (qr_inode); - qr_inode = NULL; - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "cannot set quick-read context in " - "inode (gfid:%s)", - uuid_utoa (inode->gfid)); - goto unlock; - } - } else { - qr_inode = (qr_inode_t *)(long)value; - if (qr_inode == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "cannot find quick-read context in " - "inode (gfid:%s)", - uuid_utoa (inode->gfid)); - goto unlock; - } - } - - if (qr_inode->xattr) { - dict_unref (qr_inode->xattr); - qr_inode->xattr = NULL; - table->cache_used -= qr_inode->stbuf.ia_size; - } - - qr_inode->xattr = dict_ref (dict); - 
qr_inode->stbuf = *buf; - table->cache_used += buf->ia_size; - - gettimeofday (&qr_inode->tv, NULL); - if (__qr_need_cache_prune (conf, table)) { - __qr_cache_prune (this); - } - } -unlock: - UNLOCK (&table->lock); - -out: - /* - * FIXME: content size in dict can be greater than the size application - * requested for. Applications need to be careful till this is fixed. - */ - QR_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, dict, - postparent); + LOCK(&table->lock); + { + qr_inode->priority = priority; - return 0; + __qr_inode_register(this, table, qr_inode); + } + UNLOCK(&table->lock); } - -int32_t -qr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +void +__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table, + qr_inode_t *qr_inode) { - qr_conf_t *conf = NULL; - dict_t *new_req_dict = NULL; - int32_t op_ret = -1, op_errno = EINVAL; - data_t *content = NULL; - uint64_t requested_size = 0, size = 0, value = 0; - char cached = 0; - qr_inode_t *qr_inode = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - qr_local_t *local = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - - priv = this->private; - GF_VALIDATE_OR_GOTO (frame->this->name, priv, unwind); - - conf = &priv->conf; - if (conf == NULL) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } + qr_private_t *priv = NULL; - table = &priv->table; - local = GF_CALLOC (1, sizeof (*local), gf_qr_mt_qr_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, op_errno, - ENOMEM); + priv = this->private; - frame->local = local; + GF_FREE(qr_inode->data); + qr_inode->data = NULL; - local->path = gf_strdup (loc->path); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, op_errno, - ENOMEM); - LOCK (&table->lock); - { - op_ret = inode_ctx_get (loc->inode, this, &value); - if (op_ret == 0) { - qr_inode = (qr_inode_t *)(long)value; - if 
(qr_inode != NULL) { - if (qr_inode->xattr) { - cached = 1; - } - } - } - } - UNLOCK (&table->lock); - - if ((xattr_req == NULL) && (conf->max_file_size > 0)) { - new_req_dict = xattr_req = dict_new (); - if (xattr_req == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - } + if (!list_empty(&qr_inode->lru)) { + table->cache_used -= qr_inode->size; + qr_inode->size = 0; - if (!cached) { - if (xattr_req) { - content = dict_get (xattr_req, GF_CONTENT_KEY); - if (content) { - requested_size = data_to_uint64 (content); - } - } - - if ((conf->max_file_size > 0) - && (conf->max_file_size != requested_size)) { - size = (conf->max_file_size > requested_size) ? - conf->max_file_size : requested_size; - - op_ret = dict_set (xattr_req, GF_CONTENT_KEY, - data_from_uint64 (size)); - if (op_ret < 0) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot set key in request dict to " - "request file " - "content during lookup cbk"); - goto unwind; - } - } - } + list_del_init(&qr_inode->lru); - STACK_WIND (frame, qr_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + GF_ATOMIC_DEC(priv->qr_counter.files_cached); + } - if (new_req_dict) { - dict_unref (new_req_dict); - } - - return 0; - -unwind: - QR_STACK_UNWIND (lookup, frame, op_ret, op_errno, NULL, NULL, NULL, - NULL); - - if (new_req_dict) { - dict_unref (new_req_dict); - } - - return 0; + memset(&qr_inode->buf, 0, sizeof(qr_inode->buf)); } - -int32_t -qr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) +/* To be called with priv->table.lock held */ +void +__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode, + uint64_t gen) { - uint64_t value = 0; - int32_t ret = -1; - qr_local_t *local = NULL; - qr_inode_t *qr_inode = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - call_stub_t *stub = NULL, *tmp = NULL; - char is_open = 0; - qr_private_t *priv = NULL; - qr_inode_table_t 
*table = NULL; - struct list_head waiting_ops; - - GF_ASSERT (frame); - - priv = this->private; - table = &priv->table; - - local = frame->local; - if (local != NULL) { - is_open = local->is_open; - } - - INIT_LIST_HEAD (&waiting_ops); - - ret = fd_ctx_get (fd, this, &value); - if ((ret == -1) && (op_ret != -1)) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "cannot find quick-read context in fd (%p) opened on " - "inode (gfid: %s)", fd, uuid_utoa (fd->inode->gfid)); - goto out; - } - - if (value) { - qr_fd_ctx = (qr_fd_ctx_t *) (long)value; - } - - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - qr_fd_ctx->open_in_transit = 0; - - if (op_ret == 0) { - qr_fd_ctx->opened = 1; - } - list_splice_init (&qr_fd_ctx->waiting_ops, - &waiting_ops); - } - UNLOCK (&qr_fd_ctx->lock); - - if (local && local->is_open - && ((local->open_flags & O_TRUNC) == O_TRUNC)) { - LOCK (&table->lock); - { - ret = inode_ctx_del (fd->inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long) value; - - if (qr_inode != NULL) { - __qr_inode_free (qr_inode); - } - } - } - UNLOCK (&table->lock); - } - - if (!list_empty (&waiting_ops)) { - list_for_each_entry_safe (stub, tmp, &waiting_ops, - list) { - list_del_init (&stub->list); - if (op_ret < 0) { - qr_local_t *local = NULL; - - local = stub->frame->local; - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - call_resume (stub); - } - } - } -out: - if (is_open) { - QR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } else { - STACK_DESTROY (frame->root); - } - - return 0; + __qr_inode_prune_data(this, table, qr_inode); + if (gen) + qr_inode->gen = gen; + qr_inode->invalidation_time = __qr_get_generation(this, qr_inode); } - -int32_t -qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +void +qr_inode_prune(xlator_t *this, inode_t *inode, uint64_t gen) { - qr_inode_t *qr_inode = NULL; - int32_t ret = -1; - uint64_t filep = 0; - char 
content_cached = 0; - qr_fd_ctx_t *qr_fd_ctx = NULL, *tmp_fd_ctx = NULL; - int32_t op_ret = -1, op_errno = EINVAL; - qr_local_t *local = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this->private, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - priv = this->private; - table = &priv->table; - - tmp_fd_ctx = qr_fd_ctx = GF_CALLOC (1, sizeof (*qr_fd_ctx), - gf_qr_mt_qr_fd_ctx_t); - if (qr_fd_ctx == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - LOCK_INIT (&qr_fd_ctx->lock); - INIT_LIST_HEAD (&qr_fd_ctx->waiting_ops); + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_inode_t *qr_inode = NULL; - qr_fd_ctx->path = gf_strdup (loc->path); - if (qr_fd_ctx->path == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + return; - qr_fd_ctx->flags = flags; - qr_fd_ctx->wbflags = wbflags; - - ret = fd_ctx_set (fd, this, (uint64_t)(long)qr_fd_ctx); - if (ret == -1) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "cannot set quick-read context in " - "fd (%p) opened on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } + priv = this->private; + table = &priv->table; - tmp_fd_ctx = NULL; + LOCK(&table->lock); + { + __qr_inode_prune(this, table, qr_inode, gen); + } + UNLOCK(&table->lock); +} - local = GF_CALLOC (1, sizeof (*local), gf_qr_mt_qr_local_t); - if (local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } +/* To be called with priv->table.lock held */ +void +__qr_cache_prune(xlator_t *this, qr_inode_table_t *table, qr_conf_t *conf) +{ + qr_inode_t *curr = NULL; + qr_inode_t *next = NULL; + int index = 0; + size_t size_pruned = 0; - local->is_open = 1; - local->open_flags = flags; - frame->local = local; - LOCK (&table->lock); + for 
(index = 0; index < conf->max_pri; index++) { + list_for_each_entry_safe(curr, next, &table->lru[index], lru) { - ret = inode_ctx_get (fd->inode, this, &filep); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long) filep; - if (qr_inode) { - if (qr_inode->xattr) { - content_cached = 1; - } - } - } - } - UNLOCK (&table->lock); - - if (content_cached && (flags & O_DIRECTORY)) { - op_ret = -1; - op_errno = ENOTDIR; - gf_log (this->name, GF_LOG_WARNING, - "open with O_DIRECTORY flag received on non-directory"); - goto unwind; - } + size_pruned += curr->size; - if (!content_cached || ((flags & O_ACCMODE) == O_WRONLY) - || ((flags & O_TRUNC) == O_TRUNC) - || ((flags & O_DIRECT) == O_DIRECT) - || ((wbflags & GF_OPEN_NOWB) != 0)) { - LOCK (&qr_fd_ctx->lock); - { - /* - * we really need not set this flag, since open is - * not yet unwound. - */ - - qr_fd_ctx->open_in_transit = 1; - if (((flags & O_DIRECT) == O_DIRECT) - || ((wbflags & GF_OPEN_NOWB)) != 0) { - qr_fd_ctx->disabled = 1; - } - } - UNLOCK (&qr_fd_ctx->lock); - goto wind; - } else { - op_ret = 0; - op_errno = 0; - goto unwind; - } + __qr_inode_prune(this, table, curr, 0); -unwind: - if (tmp_fd_ctx != NULL) { - qr_fd_ctx_free (tmp_fd_ctx); + if (table->cache_used < conf->cache_size) + return; } + } - QR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - return 0; - -wind: - STACK_WIND (frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, wbflags); - return 0; + return; } - -static inline time_t -qr_time_elapsed (struct timeval *now, struct timeval *then) +void +qr_cache_prune(xlator_t *this) { - time_t time_elapsed = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_table_t *table = NULL; - GF_VALIDATE_OR_GOTO ("quick-read", now, out); - GF_VALIDATE_OR_GOTO ("quick-read", then, out); + priv = this->private; + table = &priv->table; + conf = &priv->conf; - time_elapsed = now->tv_sec - then->tv_sec; - -out: - return time_elapsed; + LOCK(&table->lock); + { + if 
(table->cache_used > conf->cache_size) + __qr_cache_prune(this, table, conf); + } + UNLOCK(&table->lock); } - -static inline char -qr_need_validation (qr_conf_t *conf, qr_inode_t *qr_inode) +void * +qr_content_extract(dict_t *xdata) { - struct timeval now = {0, }; - char need_validation = 0; + data_t *data = NULL; + void *content = NULL; + int ret = 0; - GF_VALIDATE_OR_GOTO ("quick-read", conf, out); - GF_VALIDATE_OR_GOTO ("quick-read", qr_inode, out); + ret = dict_get_with_ref(xdata, GF_CONTENT_KEY, &data); + if (ret < 0 || !data) + return NULL; - gettimeofday (&now, NULL); + content = GF_MALLOC(data->len, gf_qr_mt_content_t); + if (!content) + goto out; - if (qr_time_elapsed (&now, &qr_inode->tv) >= conf->cache_timeout) - need_validation = 1; + memcpy(content, data->data, data->len); out: - return need_validation; + data_unref(data); + return content; } - -static int32_t -qr_validate_cache_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +void +qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data, + struct iatt *buf, uint64_t gen) { - qr_inode_t *qr_inode = NULL; - qr_local_t *local = NULL; - uint64_t value = 0; - int32_t ret = 0; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - call_stub_t *stub = NULL; - - GF_ASSERT (frame); - if (this == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (frame->this->name, GF_LOG_WARNING, - "xlator object (this) is NULL"); - goto unwind; - } + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + uint32_t rollover = 0; - local = frame->local; - if ((local == NULL) || ((local->fd) == NULL)) { - op_ret = -1; - op_errno = EINVAL; - gf_log (frame->this->name, GF_LOG_WARNING, - (local == NULL) ? 
"local is NULL" - : "fd is not stored in local"); - goto unwind; - } - - local->just_validated = 1; + rollover = gen >> 32; + gen = gen & 0xffffffff; - if (op_ret == -1) { - goto unwind; - } + priv = this->private; + table = &priv->table; - priv = this->private; - table = &priv->table; + LOCK(&table->lock); + { + if ((rollover != qr_inode->gen_rollover) || + (gen && qr_inode->gen && (qr_inode->gen >= gen))) + goto unlock; - LOCK (&table->lock); - { - ret = inode_ctx_get (local->fd->inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long) value; - } - - if (qr_inode != NULL) { - gettimeofday (&qr_inode->tv, NULL); - - if ((qr_inode->stbuf.ia_mtime != buf->ia_mtime) - || (qr_inode->stbuf.ia_mtime_nsec - != buf->ia_mtime_nsec)) { - inode_ctx_del (local->fd->inode, this, NULL); - __qr_inode_free (qr_inode); - } - } - } - UNLOCK (&table->lock); - - stub = local->stub; - local->stub = NULL; - - call_resume (stub); - - return 0; + if ((qr_inode->data == NULL) && (qr_inode->invalidation_time >= gen)) + goto unlock; -unwind: - /* this is actually unwind of readv */ - QR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, -1, NULL, NULL); - return 0; -} + __qr_inode_prune(this, table, qr_inode, gen); + qr_inode->data = data; + data = NULL; + qr_inode->size = buf->ia_size; -int32_t -qr_validate_cache_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - qr_local_t *local = NULL; - int32_t op_ret = -1, op_errno = -1; + qr_inode->ia_mtime = buf->ia_mtime; + qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec; + qr_inode->ia_ctime = buf->ia_ctime; + qr_inode->ia_ctime_nsec = buf->ia_ctime_nsec; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, out); + qr_inode->buf = *buf; + qr_inode->last_refresh = gf_time(); - local = frame->local; - if (local == NULL) { - op_ret = -1; - op_errno = EINVAL; - } else { - op_ret = local->op_ret; - op_errno = local->op_errno; - } + __qr_inode_register(this, table, qr_inode); + } +unlock: + UNLOCK(&table->lock); 
-out: - if (op_ret == -1) { - qr_validate_cache_cbk (frame, NULL, this, op_ret, op_errno, - NULL); - } else { - STACK_WIND (frame, qr_validate_cache_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd); - } + if (data) + GF_FREE(data); - return 0; + qr_cache_prune(this); } - -int -qr_validate_cache (call_frame_t *frame, xlator_t *this, fd_t *fd, - call_stub_t *stub) +gf_boolean_t +qr_size_fits(qr_conf_t *conf, struct iatt *buf) { - int ret = -1; - int flags = 0; - uint64_t value = 0; - loc_t loc = {0, }; - char *path = NULL; - qr_local_t *local = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - call_stub_t *validate_stub = NULL; - char need_open = 0, can_wind = 0, validate_cbk_called = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, out); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, out); - GF_VALIDATE_OR_GOTO (frame->this->name, stub, out); - - if (frame->local == NULL) { - local = GF_CALLOC (1, sizeof (*local), gf_qr_mt_qr_local_t); - if (local == NULL) { - goto out; - } - } else { - local = frame->local; - } - - local->fd = fd; - local->stub = stub; - frame->local = local; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - } - - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - validate_stub = fop_fstat_stub (frame, - qr_validate_cache_helper, - fd); - if (validate_stub == NULL) { - ret = -1; - if (need_open) { - qr_fd_ctx->open_in_transit = 0; - } - goto unlock; - } - - list_add_tail (&validate_stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - - if (ret == -1) { - goto out; - } - } else { - can_wind = 1; - } - - if (need_open) { - open_frame = create_frame (this, this->ctx->pool); - 
if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - validate_cbk_called = 1; - goto out; - } - - ret = qr_loc_fill (&loc, fd->inode, path); - if (ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - validate_cbk_called = 1; - STACK_DESTROY (open_frame->root); - goto out; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - &loc, flags, fd, qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } else if (can_wind) { - STACK_WIND (frame, qr_validate_cache_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd); - } - - ret = 0; -out: - if ((ret < 0) && !validate_cbk_called) { - if (frame->local == NULL) { - call_stub_destroy (stub); - } - - qr_validate_cache_cbk (frame, NULL, this, -1, errno, NULL); - } - return ret; + return (buf->ia_size <= conf->max_file_size); } - -int32_t -qr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobref) +gf_boolean_t +qr_mtime_equal(qr_inode_t *qr_inode, struct iatt *buf) { - GF_ASSERT (frame); - - QR_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref); - return 0; + return (qr_inode->ia_mtime == buf->ia_mtime && + qr_inode->ia_mtime_nsec == buf->ia_mtime_nsec); } - -int32_t -qr_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +gf_boolean_t +qr_ctime_equal(qr_inode_t *qr_inode, struct iatt *buf) { - qr_local_t *local = NULL; - int32_t op_errno = EINVAL, ret = 0; - uint64_t value = 0; - qr_fd_ctx_t *fdctx = NULL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t 
*)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding read call", - fdctx ? fdctx->path : NULL, strerror (errno)); - goto unwind; - } - - STACK_WIND (frame, qr_readv_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readv, fd, size, offset); - return 0; - -unwind: - QR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - return 0; + return (qr_inode->ia_ctime == buf->ia_ctime && + qr_inode->ia_ctime_nsec == buf->ia_ctime_nsec); } - -int32_t -qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +gf_boolean_t +qr_time_equal(qr_conf_t *conf, qr_inode_t *qr_inode, struct iatt *buf) { - qr_inode_t *qr_inode = NULL; - int32_t ret = -1, op_ret = -1, op_errno = -1; - uint64_t value = 0; - int count = -1, flags = 0, i = 0; - char content_cached = 0, need_validation = 0; - char need_open = 0, can_wind = 0, need_unwind = 0; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - struct iatt stbuf = {0, }; - data_t *content = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - call_stub_t *stub = NULL; - loc_t loc = {0, }; - qr_conf_t *conf = NULL; - struct iovec *vector = NULL; - char *path = NULL; - off_t start = 0, end = 0; - size_t len = 0; - struct iobuf_pool *iobuf_pool = NULL; - qr_local_t *local = NULL; - char just_validated = 0; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - call_frame_t *open_frame = NULL; - - op_ret = 0; - - priv = this->private; - conf = &priv->conf; - table = &priv->table; - - local = frame->local; - - if (local != NULL) { - just_validated = local->just_validated; - } - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - if (qr_fd_ctx != NULL) { - if (qr_fd_ctx->disabled) { - goto out; - } - } - } - - iobuf_pool = this->ctx->iobuf_pool; + if (conf->ctime_invalidation) + return qr_ctime_equal(qr_inode, buf); + else + return qr_mtime_equal(qr_inode, buf); +} - LOCK (&table->lock); - { - 
ret = inode_ctx_get (fd->inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long)value; - if (qr_inode) { - if (qr_inode->xattr){ - if (!just_validated - && qr_need_validation (conf, - qr_inode)) { - need_validation = 1; - goto unlock; - } - - content = dict_get (qr_inode->xattr, - GF_CONTENT_KEY); - - stbuf = qr_inode->stbuf; - content_cached = 1; - list_move_tail (&qr_inode->lru, - &table->lru[qr_inode->priority]); - - if (offset > content->len) { - op_ret = 0; - end = content->len; - } else { - if ((offset + size) - > content->len) { - op_ret = content->len - - offset; - end = content->len; - } else { - op_ret = size; - end = offset + size; - } - } - - count = (op_ret - / iobuf_pool->default_page_size); - if ((op_ret % iobuf_pool->default_page_size) - != 0) { - count++; - } - - if (count == 0) { - op_ret = 0; - goto unlock; - } - - vector = GF_CALLOC (count, - sizeof (*vector), - gf_qr_mt_iovec); - if (vector == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - goto unlock; - } - - iobref = iobref_new (); - if (iobref == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - goto unlock; - } - - for (i = 0; i < count; i++) { - iobuf = iobuf_get (iobuf_pool); - if (iobuf == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - goto unlock; - } - - start = offset - + - (iobuf_pool->default_page_size - * i); - - if (start > end) { - len = 0; - } else { - len = - (iobuf_pool->default_page_size - > (end - start)) - ? 
(end - start) - : - iobuf_pool->default_page_size; - - memcpy (iobuf->ptr, - content->data - + start, - len); - } - - iobref_add (iobref, iobuf); - iobuf_unref (iobuf); - - vector[i].iov_base = iobuf->ptr; - vector[i].iov_len = len; - } - } - } - } - } -unlock: - UNLOCK (&table->lock); +void +__qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf, + uint64_t gen) +{ + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + qr_conf_t *conf = NULL; + uint32_t rollover = 0; -out: - if (content_cached || need_unwind) { - QR_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, - count, &stbuf, iobref); - - } else if (need_validation) { - stub = fop_readv_stub (frame, qr_readv, fd, size, offset); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - qr_validate_cache (frame, this, fd, stub); - } else { - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - if (frame->local == NULL) { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto fdctx_unlock; - } - } - - stub = fop_readv_stub (frame, - qr_readv_helper, - fd, size, - offset); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto fdctx_unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - fdctx_unlock: - UNLOCK (&qr_fd_ctx->lock); - - if (op_ret == -1) { - need_unwind = 1; - goto out; - } - } else { - can_wind = 1; - } - - if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame 
(this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, - &loc, flags, fd, qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } else if (can_wind) { - STACK_WIND (frame, qr_readv_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->readv, fd, size, - offset); - } + rollover = gen >> 32; + gen = gen & 0xffffffff; - } + priv = this->private; + table = &priv->table; + conf = &priv->conf; -ret: - if (vector) { - GF_FREE (vector); - } + /* allow for rollover of frame->root->unique */ + if ((rollover != qr_inode->gen_rollover) || + (gen && qr_inode->gen && (qr_inode->gen >= gen))) + goto done; - if (iobref) { - iobref_unref (iobref); - } + if ((qr_inode->data == NULL) && (qr_inode->invalidation_time >= gen)) + goto done; - return 0; -} + qr_inode->gen = gen; + if (qr_size_fits(conf, buf) && qr_time_equal(conf, qr_inode, buf)) { + qr_inode->buf = *buf; + qr_inode->last_refresh = gf_time(); + __qr_inode_register(this, table, qr_inode); + } else { + __qr_inode_prune(this, table, qr_inode, gen); + } -int32_t -qr_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; +done: + return; } - -int32_t -qr_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t off, - struct iobref *iobref) +void +qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf, + uint64_t gen) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, 
unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding write call", - fdctx ? fdctx->path : NULL, strerror (errno)); - goto unwind; - } + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; - STACK_WIND (frame, qr_writev_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->writev, fd, vector, count, off, - iobref); - return 0; + priv = this->private; + table = &priv->table; -unwind: - QR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); - return 0; + LOCK(&table->lock); + { + __qr_content_refresh(this, qr_inode, buf, gen); + } + UNLOCK(&table->lock); } - -int32_t -qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t off, struct iobref *iobref) +gf_boolean_t +__qr_cache_is_fresh(xlator_t *this, qr_inode_t *qr_inode) { - uint64_t value = 0; - int flags = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_inode_t *qr_inode = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t op_ret = -1, op_errno = -1, ret = -1; - char can_wind = 0, need_unwind = 0, need_open = 0; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - call_frame_t *open_frame = NULL; - - priv = this->private; - table = &priv->table; - - ret = fd_ctx_get (fd, this, &value); - - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - } - - LOCK (&table->lock); - { - ret = inode_ctx_get (fd->inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long)value; - if (qr_inode != NULL) { - inode_ctx_del (fd->inode, this, NULL); - __qr_inode_free (qr_inode); - } - } - } - UNLOCK (&table->lock); - - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || 
qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_writev_stub (frame, qr_writev_helper, - fd, vector, count, off, - iobref); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; - if (need_unwind) { - QR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_writev_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->writev, fd, vector, count, - off, iobref); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + priv = this->private; + conf = &priv->conf; -ret: - return 0; -} + if (qr_inode->last_refresh < priv->last_child_down) + return _gf_false; + if (gf_time() - qr_inode->last_refresh >= conf->cache_timeout) + return _gf_false; -int32_t -qr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *buf) -{ - QR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); - return 0; + return _gf_true; } +int +qr_lookup_cbk(call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode_ret, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + void *content = NULL; + qr_inode_t *qr_inode = NULL; + inode_t *inode = NULL; + qr_local_t *local = NULL; + + local = frame->local; + inode = local->inode; + + if (op_ret == -1) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + if (dict_get(xdata, GLUSTERFS_BAD_INODE)) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + if (dict_get(xdata, "sh-failed")) { + qr_inode_prune(this, inode, local->incident_gen); + goto out; + } + + content = qr_content_extract(xdata); + + if (content) { + /* new content came along, always replace old content */ + qr_inode = qr_inode_ctx_get_or_new(this, inode); + if (!qr_inode) { + /* no harm done */ + GF_FREE(content); + goto out; + } + + qr_content_update(this, qr_inode, content, buf, local->incident_gen); + } else { + /* purge old content if necessary */ + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + /* usual path for large files */ + goto out; + + qr_content_refresh(this, qr_inode, buf, local->incident_gen); + } +out: + QR_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode_ret, buf, xdata, + postparent); + return 0; +} -int32_t -qr_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fstat call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } +int +qr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + qr_inode_t *qr_inode = NULL; + int ret = -1; + dict_t *new_xdata = NULL; + qr_local_t *local = NULL; + + priv = this->private; + conf = &priv->conf; + local = qr_local_get(this, loc->inode); + local->inode = inode_ref(loc->inode); + frame->local = local; + + qr_inode = qr_inode_ctx_get(this, loc->inode); + if (qr_inode && qr_inode->data) + /* cached. only validate in qr_lookup_cbk */ + goto wind; + + if (!xdata) + xdata = new_xdata = dict_new(); + + if (!xdata) + goto wind; + + ret = 0; + if (conf->max_file_size) + ret = dict_set(xdata, GF_CONTENT_KEY, + data_from_uint64(conf->max_file_size)); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_DICT_SET_FAILED, + "cannot set key in request dict (%s)", loc->path); +wind: + STACK_WIND(frame, qr_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); - STACK_WIND (frame, qr_fstat_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd); - return 0; + if (new_xdata) + dict_unref(new_xdata); -unwind: - QR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL); - return 0; + return 0; } - -int32_t -qr_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +int +qr_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) { - qr_fd_ctx_t *qr_fd_ctx = NULL; - char need_open = 0, can_wind = 0, need_unwind = 0; - uint64_t value = 0; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - call_stub_t *stub = NULL; - loc_t loc = {0, }; - char *path = NULL; - int flags = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto unwind; - } + gf_dirent_t *entry = NULL; + qr_inode_t *qr_inode = NULL; + qr_local_t *local = NULL; - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - } + local = frame->local; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fstat_stub (frame, qr_fstat_helper, - fd); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + if (op_ret <= 0) + goto unwind; -unwind: - if (need_unwind) { - QR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_fstat_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode) + continue; -ret: - return 0; -} + qr_inode = 
qr_inode_ctx_get(this, entry->inode); + if (!qr_inode) + /* no harm */ + continue; - -int32_t -qr_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, preop, postop); - return 0; -} - - -int32_t -qr_fsetattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) -{ - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fsetattr " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, qr_fsetattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, - valid); - return 0; + qr_content_refresh(this, qr_inode, &entry->d_stat, local->incident_gen); + } unwind: - QR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL); - return 0; + QR_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; } - -int32_t -qr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid) +int +qr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - uint64_t value = 0; - int flags = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" : - "fd is NULL"); - need_unwind = 1; - goto out; - } - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - } + qr_local_t *local = NULL; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fsetattr_stub (frame, - qr_fsetattr_helper, - fd, stbuf, valid); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } - -out: - if (need_unwind) { - QR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_fsetattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, - valid); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + local = qr_local_get(this, NULL); + frame->local = local; -ret: - return 0; + STACK_WIND(frame, qr_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata); + return 0; } 
- -int32_t -qr_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +int +qr_readv_cached(call_frame_t *frame, qr_inode_t *qr_inode, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - GF_ASSERT (frame); - QR_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno); - return 0; -} + xlator_t *this = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + int op_ret = -1; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = { + 0, + }; + struct iatt buf = { + 0, + }; + this = frame->this; + priv = this->private; + table = &priv->table; -int32_t -qr_fsetxattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - dict_t *dict, int32_t flags) -{ - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fsetxattr " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + LOCK(&table->lock); + { + if (!qr_inode->data) + goto unlock; - STACK_WIND (frame, qr_fsetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags); - return 0; + if (offset >= qr_inode->size) + goto unlock; -unwind: - QR_STACK_UNWIND (fsetxattr, frame, -1, op_errno); - return 0; -} + if (!__qr_cache_is_fresh(this, qr_inode)) + goto unlock; + op_ret = min(size, (qr_inode->size - offset)); -int32_t -qr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, - int32_t flags) -{ - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - int open_flags = 0; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) " - "is NULL" : "fd is NULL"); - need_unwind = 1; - goto out; + iobuf = iobuf_get2(this->ctx->iobuf_pool, op_ret); + if (!iobuf) { + op_ret = -1; + goto unlock; } - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; + iobref = iobref_new(); + if (!iobref) { + op_ret = -1; + goto unlock; } - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - open_flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fsetxattr_stub (frame, - qr_fsetxattr_helper, - fd, dict, flags); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + iobref_add(iobref, iobuf); -out: - if (need_unwind) { - QR_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno); - } else if (can_wind) { - STACK_WIND (frame, qr_fsetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsetxattr, fd, dict, - flags); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, open_flags, - fd, qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + memcpy(iobuf->ptr, qr_inode->data + 
offset, op_ret); -ret: - return 0; -} + buf = qr_inode->buf; + /* bump LRU */ + __qr_inode_register(frame->this, table, qr_inode); + } +unlock: + UNLOCK(&table->lock); -int32_t -qr_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict); - return 0; -} + if (op_ret >= 0) { + iov.iov_base = iobuf->ptr; + iov.iov_len = op_ret; + GF_ATOMIC_INC(priv->qr_counter.cache_hit); + STACK_UNWIND_STRICT(readv, frame, op_ret, 0, &iov, 1, &buf, iobref, + xdata); + } else { + GF_ATOMIC_INC(priv->qr_counter.cache_miss); + } -int32_t -qr_fgetxattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name) -{ - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fgetxattr " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + if (iobuf) + iobuf_unref(iobuf); - STACK_WIND (frame, qr_fgetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fgetxattr, fd, name); - return 0; + if (iobref) + iobref_unref(iobref); -unwind: - QR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL); - return 0; + return op_ret; } - -int32_t -qr_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name) +int +qr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - int flags = 0; - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - /* - * FIXME: Can quick-read use the extended attributes stored in the - * cache? this needs to be discussed. - */ - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" : - "fd is NULL"); - need_unwind = 1; - goto out; - } - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - } + qr_inode_t *qr_inode = NULL; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fgetxattr_stub (frame, - qr_fgetxattr_helper, - fd, name); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + qr_inode = qr_inode_ctx_get(this, fd->inode); + if (!qr_inode) + goto wind; -out: - if (need_unwind) { - QR_STACK_UNWIND (open, frame, op_ret, op_errno, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_fgetxattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fgetxattr, fd, name); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + if (qr_readv_cached(frame, qr_inode, size, offset, flags, xdata) < 0) + goto wind; -ret: - return 0; + return 0; +wind: + STACK_WIND(frame, 
default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; } - int32_t -qr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) +qr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - GF_ASSERT (frame); - QR_STACK_UNWIND (flush, frame, op_ret, op_errno); - return 0; -} + qr_local_t *local = NULL; + local = frame->local; -int32_t -qr_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding flush call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + qr_inode_prune(this, local->fd->inode, local->incident_gen); - STACK_WIND (frame, qr_flush_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, fd); - return 0; - -unwind: - QR_STACK_UNWIND (flush, frame, -1, op_errno); - return 0; + QR_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } - -int32_t -qr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +int +qr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - uint64_t value = 0; - call_stub_t *stub = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char can_wind = 0, need_unwind = 0; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? "xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + qr_local_t *local = NULL; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - if (qr_fd_ctx->opened) { - can_wind = 1; - } else if (qr_fd_ctx->open_in_transit) { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_flush_stub (frame, qr_flush_helper, - fd); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } else { - op_ret = 0; - need_unwind = 1; - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); -out: - if (need_unwind) { - QR_STACK_UNWIND (flush, frame, 
op_ret, op_errno); - } else if (can_wind) { - STACK_WIND (frame, qr_flush_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, fd); - } + frame->local = local; - return 0; + STACK_WIND(frame, qr_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, iov, count, offset, flags, + iobref, xdata); + return 0; } - int32_t -qr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +qr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - GF_ASSERT (frame); - QR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - return 0; -} + qr_local_t *local = NULL; + local = frame->local; + qr_inode_prune(this, local->inode, local->incident_gen); -int32_t -qr_fentrylk_helper (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, const char *basename, entrylk_cmd cmd, - entrylk_type type) + QR_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int +qr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fentrylk " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + qr_local_t *local = NULL; - STACK_WIND(frame, qr_fentrylk_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, - cmd, type); - return 0; + local = qr_local_get(this, loc->inode); + local->inode = inode_ref(loc->inode); + frame->local = local; -unwind: - QR_STACK_UNWIND (fentrylk, frame, -1, op_errno); - return 0; + STACK_WIND(frame, qr_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; } - int32_t -qr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) +qr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - int flags = 0; - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } + qr_local_t *local = NULL; - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fentrylk_stub (frame, - qr_fentrylk_helper, - volume, fd, basename, - cmd, type); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } - -out: - if (need_unwind) { - QR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - } else if (can_wind) { - STACK_WIND (frame, qr_fentrylk_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fentrylk, volume, fd, - basename, cmd, type); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } - -ret: - return 0; + QR_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); + 
return 0; } +int +qr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + qr_local_t *local = NULL; -int32_t -qr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - return 0; + STACK_WIND(frame, qr_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; } - int32_t -qr_finodelk_helper (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, int32_t cmd, struct gf_flock *lock) +qr_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding finodelk " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + qr_local_t *local = NULL; - STACK_WIND (frame, qr_finodelk_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, lock); - return 0; + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); -unwind: - QR_STACK_UNWIND (finodelk, frame, -1, op_errno); - return 0; + QR_STACK_UNWIND(fallocate, frame, op_ret, op_errno, pre, post, xdata); + return 0; } - -int32_t -qr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - int32_t cmd, struct gf_flock *lock) +static int +qr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int keep_size, + off_t offset, size_t len, dict_t *xdata) { - int flags = 0; - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + qr_local_t *local = NULL; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_finodelk_stub (frame, - qr_finodelk_helper, - volume, fd, cmd, - lock); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; -out: - if (need_unwind) { - QR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } else if (can_wind) { - STACK_WIND (frame, qr_finodelk_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->finodelk, volume, fd, - cmd, lock); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } - -ret: - return 0; + STACK_WIND(frame, qr_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, 
keep_size, offset, len, + xdata); + return 0; } - -int32_t -qr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf) -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - - int32_t -qr_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +qr_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding fsync call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + qr_local_t *local = NULL; - STACK_WIND (frame, qr_fsync_cbk, FIRST_CHILD (this), - FIRST_CHILD(this)->fops->fsync, fd, flags); - return 0; + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); -unwind: - QR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); - return 0; + QR_STACK_UNWIND(discard, frame, op_ret, op_errno, pre, post, xdata); + return 0; } - -int32_t -qr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags) +static int +qr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - int open_flags = 0; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } + qr_local_t *local = NULL; - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - open_flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_fsync_stub (frame, qr_fsync_helper, - fd, flags); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } - -out: - if (need_unwind) { - QR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_fsync_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, fd, flags); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, open_flags, - fd, qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } - -ret: - return 0; + STACK_WIND(frame, qr_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + 
return 0; } - int32_t -qr_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +qr_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - int32_t ret = 0; - uint64_t value = 0; - qr_inode_t *qr_inode = NULL; - qr_local_t *local = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; + qr_local_t *local = NULL; - GF_ASSERT (frame); + local = frame->local; + qr_inode_prune(this, local->fd->inode, local->incident_gen); - if (op_ret == -1) { - goto out; - } - - local = frame->local; - if ((local == NULL) || (local->fd == NULL) - || (local->fd->inode == NULL)) { - op_ret = -1; - op_errno = EINVAL; - gf_log (frame->this->name, GF_LOG_WARNING, "cannot get inode"); - goto out; - } - - if ((this == NULL) || (this->private == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "cannot get quick read configuration from xlator " - "object"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } + QR_STACK_UNWIND(zerofill, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} - priv = this->private; - table = &priv->table; +static int +qr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + qr_local_t *local = NULL; - LOCK (&table->lock); - { - ret = inode_ctx_get (local->fd->inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long) value; - - if (qr_inode) { - if (qr_inode->stbuf.ia_size != postbuf->ia_size) - { - inode_ctx_del (local->fd->inode, this, - NULL); - __qr_inode_free (qr_inode); - } - } - } - } - UNLOCK (&table->lock); + local = qr_local_get(this, fd->inode); + local->fd = fd_ref(fd); + frame->local = local; -out: - QR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + STACK_WIND(frame, qr_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; } - -int32_t -qr_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) +int +qr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding ftruncate " - "call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, qr_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - return 0; + qr_inode_set_priority(this, fd->inode, loc->path); -unwind: - QR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; } - -int32_t -qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +int +qr_forget(xlator_t *this, inode_t *inode) { - int flags = 0; - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_local_t *local = NULL; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } + qr_inode_t *qr_inode = NULL; - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + qr_inode = qr_inode_ctx_get(this, inode); - local = GF_CALLOC (1, sizeof (*local), gf_qr_mt_qr_local_t); - if (local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - goto out; - } + if (!qr_inode) + return 0; - local->fd = fd; - frame->local = local; - - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - stub = fop_ftruncate_stub (frame, - qr_ftruncate_helper, - fd, offset); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + qr_inode_prune(this, inode, qr_get_generation(this, inode)); -out: - if (need_unwind) { - QR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, - NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } + GF_FREE(qr_inode); -ret: - return 0; + return 0; } - int32_t -qr_lk_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct gf_flock *lock) -{ - GF_ASSERT (frame); - QR_STACK_UNWIND (lk, frame, op_ret, op_errno, lock); - return 0; +qr_inodectx_dump(xlator_t *this, inode_t *inode) +{ + qr_inode_t *qr_inode = NULL; + int32_t ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char buf[GF_TIMESTR_SIZE] = { + 0, + }; + + qr_inode = qr_inode_ctx_get(this, inode); + if (!qr_inode) + goto out; + + gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read", + "inodectx"); + gf_proc_dump_add_section("%s", key_prefix); + + gf_proc_dump_write("entire-file-cached", "%s", + qr_inode->data ? "yes" : "no"); + + if (qr_inode->last_refresh) { + gf_time_fmt(buf, sizeof buf, qr_inode->last_refresh, gf_timefmt_FT); + gf_proc_dump_write("last-cache-validation-time", "%s", buf); + } + + ret = 0; +out: + return ret; } - -int32_t -qr_lk_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) +int +qr_priv_dump(xlator_t *this) { - qr_local_t *local = NULL; - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - - local = frame->local; - GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - if (local->op_ret < 0) { - op_errno = local->op_errno; - - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - fdctx = (qr_fd_ctx_t *)(long) value; - } - - gf_log (this->name, GF_LOG_WARNING, - "open failed on path (%s) (%s), unwinding lk call", - fdctx ? 
fdctx->path : NULL, strerror (op_errno)); - goto unwind; - } + qr_conf_t *conf = NULL; + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; + uint32_t file_count = 0; + uint32_t i = 0; + qr_inode_t *curr = NULL; + uint64_t total_size = 0; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; - STACK_WIND (frame, qr_lk_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, fd, cmd, lock); + if (!this) { + return -1; + } - return 0; + priv = this->private; + conf = &priv->conf; + if (!conf) + return -1; -unwind: - QR_STACK_UNWIND (lk, frame, -1, op_errno, NULL); - return 0; -} + table = &priv->table; + gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read", "priv"); -int32_t -qr_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock) -{ - int flags = 0; - uint64_t value = 0; - call_stub_t *stub = NULL; - char *path = NULL; - loc_t loc = {0, }; - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - char need_open = 0, can_wind = 0, need_unwind = 0; - call_frame_t *open_frame = NULL; - - GF_ASSERT (frame); - if ((this == NULL) || (fd == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "fd is NULL"); - need_unwind = 1; - goto out; - } + gf_proc_dump_add_section("%s", key_prefix); - ret = fd_ctx_get (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long)value; - } + gf_proc_dump_write("max_file_size", "%" PRIu64, conf->max_file_size); + gf_proc_dump_write("cache_timeout", "%d", conf->cache_timeout); - if (qr_fd_ctx) { - LOCK (&qr_fd_ctx->lock); - { - path = qr_fd_ctx->path; - flags = qr_fd_ctx->flags; - - if (!(qr_fd_ctx->opened - || qr_fd_ctx->open_in_transit)) { - need_open = 1; - qr_fd_ctx->open_in_transit = 1; - } - - if (qr_fd_ctx->opened) { - can_wind = 1; - } else { - frame->local = GF_CALLOC (1, - sizeof (qr_local_t), - gf_qr_mt_qr_local_t); - if (frame->local == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - stub = fop_lk_stub (frame, qr_lk_helper, fd, - cmd, lock); - if (stub == NULL) { - op_ret = -1; - op_errno = ENOMEM; - need_unwind = 1; - qr_fd_ctx->open_in_transit = 0; - goto unlock; - } - - list_add_tail (&stub->list, - &qr_fd_ctx->waiting_ops); - } - } - unlock: - UNLOCK (&qr_fd_ctx->lock); - } else { - can_wind = 1; - } + if (!table) { + goto out; + } else { + for (i = 0; i < conf->max_pri; i++) { + list_for_each_entry(curr, &table->lru[i], lru) + { + file_count++; + total_size += curr->size; + } + } + } + + gf_proc_dump_write("total_files_cached", "%d", file_count); + gf_proc_dump_write("total_cache_used", "%" PRIu64, total_size); + gf_proc_dump_write("cache-hit", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.cache_hit)); + gf_proc_dump_write("cache-miss", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.cache_miss)); + gf_proc_dump_write("cache-invalidations", "%" GF_PRI_ATOMIC, + GF_ATOMIC_GET(priv->qr_counter.file_data_invals)); out: - if (need_unwind) { - QR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); - } else if (can_wind) { - STACK_WIND (frame, qr_lk_cbk, FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->lk, fd, cmd, lock); - } else if (need_open) { - op_ret = qr_loc_fill (&loc, fd->inode, path); - if (op_ret == -1) { - qr_resume_pending_ops (qr_fd_ctx, -1, errno); - goto ret; - } - - open_frame = create_frame (this, this->ctx->pool); - if (open_frame == NULL) { - qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM); - qr_loc_wipe (&loc); - goto ret; - } - - STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, &loc, flags, fd, - qr_fd_ctx->wbflags); - - qr_loc_wipe (&loc); - } - -ret: - return 0; + return 0; } - -int32_t -qr_release (xlator_t *this, fd_t *fd) +static int32_t +qr_dump_metrics(xlator_t *this, int fd) { - qr_fd_ctx_t *qr_fd_ctx = NULL; - int32_t ret = 0; - uint64_t value = 0; - - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - ret = fd_ctx_del (fd, this, &value); - if (ret == 0) { - qr_fd_ctx = (qr_fd_ctx_t *)(long) value; - if (qr_fd_ctx) { - qr_fd_ctx_free (qr_fd_ctx); - } - } + qr_private_t *priv = NULL; + qr_inode_table_t *table = NULL; -out: - return 0; -} + priv = this->private; + table = &priv->table; + dprintf(fd, "%s.total_files_cached %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.files_cached)); + dprintf(fd, "%s.total_cache_used %" PRId64 "\n", this->name, + table->cache_used); + dprintf(fd, "%s.cache-hit %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.cache_hit)); + dprintf(fd, "%s.cache-miss %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.cache_miss)); + dprintf(fd, "%s.cache-invalidations %" PRId64 "\n", this->name, + GF_ATOMIC_GET(priv->qr_counter.file_data_invals)); + + return 0; +} int32_t -qr_forget (xlator_t *this, inode_t *inode) +qr_mem_acct_init(xlator_t *this) { - qr_inode_t *qr_inode = NULL; - uint64_t value = 0; - int32_t ret = -1; - qr_private_t *priv = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); - 
GF_VALIDATE_OR_GOTO (this->name, inode, out); + if (!this) + return ret; - priv = this->private; - - LOCK (&priv->table.lock); - { - ret = inode_ctx_del (inode, this, &value); - if (ret == 0) { - qr_inode = (qr_inode_t *)(long) value; - __qr_inode_free (qr_inode); - } - } - UNLOCK (&priv->table.lock); + ret = xlator_mem_acct_init(this, gf_qr_mt_end + 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, QUICK_READ_MSG_NO_MEMORY, + "Memory accounting init failed"); + return ret; + } + + return ret; +} + +static gf_boolean_t +check_cache_size_ok(xlator_t *this, int64_t cache_size) +{ + int ret = _gf_true; + uint64_t total_mem = 0; + uint64_t max_cache_size = 0; + volume_option_t *opt = NULL; + + GF_ASSERT(this); + opt = xlator_volume_option_get(this, "cache-size"); + if (!opt) { + ret = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + QUICK_READ_MSG_INVALID_ARGUMENT, + "could not get cache-size option"); + goto out; + } + + total_mem = get_mem_size(); + if (-1 == total_mem) + max_cache_size = opt->max; + else + max_cache_size = total_mem; + + gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, max_cache_size); + if (cache_size > max_cache_size) { + ret = _gf_false; + gf_msg(this->name, GF_LOG_ERROR, 0, QUICK_READ_MSG_INVALID_ARGUMENT, + "Cache size %" PRIu64 + " is greater than the max size of %" PRIu64, + cache_size, max_cache_size); + goto out; + } out: - return 0; + return ret; } - -int32_t -qr_inodectx_dump (xlator_t *this, inode_t *inode) +int +qr_reconfigure(xlator_t *this, dict_t *options) { - qr_inode_t *qr_inode = NULL; - uint64_t value = 0; - int32_t ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - char buf[256] = {0, }; - struct tm *tm = NULL; - ret = inode_ctx_get (inode, this, &value); - if (ret != 0) { - goto out; - } + int32_t ret = -1; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + uint64_t cache_size_new = 0; - qr_inode = (qr_inode_t *)(long)value; - if (qr_inode == NULL) { - goto out; - } + 
GF_VALIDATE_OR_GOTO("quick-read", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, options, out); + + priv = this->private; - gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", - "inodectx"); - gf_proc_dump_add_section (key_prefix); + conf = &priv->conf; + if (!conf) { + goto out; + } - gf_proc_dump_write ("entire-file-cached", "%s", qr_inode->xattr ? "yes" : "no"); + GF_OPTION_RECONF("cache-timeout", conf->cache_timeout, options, int32, out); - tm = localtime (&qr_inode->tv.tv_sec); - strftime (buf, 256, "%Y-%m-%d %H:%M:%S", tm); - snprintf (buf + strlen (buf), 256 - strlen (buf), - ".%"GF_PRI_SUSECONDS, qr_inode->tv.tv_usec); + GF_OPTION_RECONF("quick-read-cache-invalidation", conf->qr_invalidation, + options, bool, out); - gf_proc_dump_write ("last-cache-validation-time", "%s", buf); + GF_OPTION_RECONF("ctime-invalidation", conf->ctime_invalidation, options, + bool, out); - ret = 0; + GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64, out); + if (!check_cache_size_ok(this, cache_size_new)) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, EINVAL, QUICK_READ_MSG_INVALID_CONFIG, + "Not reconfiguring cache-size"); + goto out; + } + conf->cache_size = cache_size_new; + + ret = 0; out: - return ret; + return ret; } int32_t -qr_fdctx_dump (xlator_t *this, fd_t *fd) -{ - qr_fd_ctx_t *fdctx = NULL; - uint64_t value = 0; - int32_t ret = 0, i = 0; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - call_stub_t *stub = NULL; - - ret = fd_ctx_get (fd, this, &value); - if (ret != 0) { - goto out; - } - - fdctx = (qr_fd_ctx_t *)(long)value; - if (fdctx == NULL) { - goto out; +qr_get_priority_list(const char *opt_str, struct list_head *first) +{ + int32_t max_pri = 1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *priority_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char 
*string = NULL; + struct qr_priority *curr = NULL, *tmp = NULL; + + GF_VALIDATE_OR_GOTO("quick-read", opt_str, out); + GF_VALIDATE_OR_GOTO("quick-read", first, out); + + string = gf_strdup(opt_str); + if (string == NULL) { + max_pri = -1; + goto out; + } + + /* Get the pattern for cache priority. + * "option priority *.jpg:1,abc*:2" etc + */ + /* TODO: inode_lru in table is statically hard-coded to 5, + * should be changed to run-time configuration + */ + priority_str = strtok_r(string, ",", &tmp_str); + while (priority_str) { + curr = GF_CALLOC(1, sizeof(*curr), gf_qr_mt_qr_priority_t); + if (curr == NULL) { + max_pri = -1; + goto out; + } + + list_add_tail(&curr->list, first); + + dup_str = gf_strdup(priority_str); + if (dup_str == NULL) { + max_pri = -1; + goto out; + } + + pattern = strtok_r(dup_str, ":", &tmp_str1); + if (!pattern) { + max_pri = -1; + goto out; + } + + priority = strtok_r(NULL, ":", &tmp_str1); + if (!priority) { + max_pri = -1; + goto out; + } + + gf_msg_trace("quick-read", 0, + "quick-read priority : pattern %s : priority %s", pattern, + priority); + + curr->pattern = gf_strdup(pattern); + if (curr->pattern == NULL) { + max_pri = -1; + goto out; + } + + curr->priority = strtol(priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) { + max_pri = -1; + goto out; + } else { + max_pri = max(max_pri, curr->priority); } - gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", - "fdctx"); - gf_proc_dump_add_section (key_prefix); + GF_FREE(dup_str); + dup_str = NULL; - gf_proc_dump_write ("fd", "%p", fd); + priority_str = strtok_r(NULL, ",", &tmp_str); + } +out: + GF_FREE(string); - gf_proc_dump_write ("path", "%s", fdctx->path); + GF_FREE(dup_str); - LOCK (&fdctx->lock); + if (max_pri == -1) { + list_for_each_entry_safe(curr, tmp, first, list) { - gf_proc_dump_write ("opened", "%s", fdctx->opened ? "yes" : "no"); - - gf_proc_dump_write ("open-in-progress", "%s", fdctx->open_in_transit ? 
- "yes" : "no"); - - gf_proc_dump_write ("caching disabled (for this fd)", "%s", - fdctx->disabled ? "yes" : "no"); - - gf_proc_dump_write ("flags", "%d", fdctx->flags); - - gf_proc_dump_write ("wbflags", "%d", fdctx->wbflags); - - list_for_each_entry (stub, &fdctx->waiting_ops, list) { - gf_proc_dump_build_key (key, "", - "waiting-ops[%d].frame", i); - gf_proc_dump_write (key, "%"PRId64, - stub->frame->root->unique); - - gf_proc_dump_build_key (key, "", - "waiting-ops[%d].fop", i); - gf_proc_dump_write (key, "%s", gf_fop_list[stub->fop]); - - i++; - } + list_del_init(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); } - UNLOCK (&fdctx->lock); + } - ret = 0; -out: - return ret; + return max_pri; } -int -qr_priv_dump (xlator_t *this) -{ - qr_conf_t *conf = NULL; - qr_private_t *priv = NULL; - qr_inode_table_t *table = NULL; - uint32_t file_count = 0; - uint32_t i = 0; - qr_inode_t *curr = NULL; - uint64_t total_size = 0; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - - if (!this) { - return -1; - } - - priv = this->private; - conf = &priv->conf; - - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, "conf null in xlator"); - return -1; - } - - table = &priv->table; - +int32_t +qr_init(xlator_t *this) +{ + int32_t ret = -1, i = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: volume (%s) not configured with exactly one " + "child", + this->name); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(*priv), gf_qr_mt_qr_private_t); + if (priv == NULL) { + ret = -1; + goto out; + } + + LOCK_INIT(&priv->table.lock); + conf = &priv->conf; + + GF_OPTION_INIT("max-file-size", conf->max_file_size, size_uint64, out); + + GF_OPTION_INIT("cache-timeout", conf->cache_timeout, int32, out); + + GF_OPTION_INIT("quick-read-cache-invalidation", conf->qr_invalidation, bool, + out); + + GF_OPTION_INIT("cache-size", conf->cache_size, size_uint64, out); + if (!check_cache_size_ok(this, conf->cache_size)) { + ret = -1; + goto out; + } + + GF_OPTION_INIT("ctime-invalidation", conf->ctime_invalidation, bool, out); + + INIT_LIST_HEAD(&conf->priority_list); + conf->max_pri = 1; + if (dict_get(this->options, "priority")) { + char *option_list = data_to_str(dict_get(this->options, "priority")); + gf_msg_trace(this->name, 0, "option path %s", option_list); + /* parse the list of pattern:priority */ + conf->max_pri = qr_get_priority_list(option_list, &conf->priority_list); + + if (conf->max_pri == -1) { + goto out; + } + conf->max_pri++; + } + + priv->table.lru = GF_CALLOC(conf->max_pri, sizeof(*priv->table.lru), + gf_common_mt_list_head); + if (priv->table.lru == NULL) { + ret = -1; + goto out; + } + + for (i = 0; i < conf->max_pri; i++) { + INIT_LIST_HEAD(&priv->table.lru[i]); + } + + ret = 0; + + priv->last_child_down = gf_time(); + GF_ATOMIC_INIT(priv->generation, 0); + this->private = priv; +out: + if ((ret == -1) && priv) { + GF_FREE(priv); + } - gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read", - "priv"); + return ret; +} - gf_proc_dump_add_section (key_prefix); +void +qr_inode_table_destroy(qr_private_t *priv) +{ + int i = 0; + qr_conf_t *conf = NULL; - gf_proc_dump_write ("max_file_size", "%d", conf->max_file_size); - gf_proc_dump_write ("cache_timeout", "%d", conf->cache_timeout); + conf = &priv->conf; - if (!table) { - gf_log (this->name, GF_LOG_WARNING, "table is NULL"); - goto out; - } else { - for (i = 
0; i < conf->max_pri; i++) { - list_for_each_entry (curr, &table->lru[i], lru) { - file_count++; - total_size += curr->stbuf.ia_size; - } - } + for (i = 0; i < conf->max_pri; i++) { + /* There is a known leak of inodes, hence until + * that is fixed, log the assert as warning. + GF_ASSERT (list_empty (&priv->table.lru[i]));*/ + if (!list_empty(&priv->table.lru[i])) { + gf_msg("quick-read", GF_LOG_INFO, 0, QUICK_READ_MSG_LRU_NOT_EMPTY, + "quick read inode table lru not empty"); } + } - gf_proc_dump_write ("total_files_cached", "%d", file_count); - gf_proc_dump_write ("total_cache_used", "%d", total_size); + LOCK_DESTROY(&priv->table.lock); -out: - return 0; + return; } - -int32_t -mem_acct_init (xlator_t *this) +void +qr_conf_destroy(qr_conf_t *conf) { - int ret = -1; - - if (!this) - return ret; + struct qr_priority *curr = NULL, *tmp = NULL; - ret = xlator_mem_acct_init (this, gf_qr_mt_end + 1); + list_for_each_entry_safe(curr, tmp, &conf->priority_list, list) + { + list_del(&curr->list); + GF_FREE(curr->pattern); + GF_FREE(curr); + } - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; + return; } -gf_boolean_t -check_cache_size_ok (xlator_t *this, int64_t cache_size) +void +qr_update_child_down_time(xlator_t *this, time_t now) { - int ret = _gf_true; - uint64_t total_mem = 0; - uint64_t max_cache_size = 0; - volume_option_t *opt = NULL; - - GF_ASSERT (this); - opt = xlator_volume_option_get (this, "cache-size"); - if (!opt) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, - "could not get cache-size option"); - goto out; - } + qr_private_t *priv = NULL; - total_mem = get_mem_size (); - if (-1 == total_mem) - max_cache_size = opt->max; - else - max_cache_size = total_mem; - - gf_log (this->name, GF_LOG_INFO, "Max cache size is %"PRIu64, - max_cache_size); - if (cache_size > max_cache_size) { - ret = _gf_false; - gf_log (this->name, GF_LOG_ERROR, "Cache size %"PRIu64 - " is greater 
than the max size of %"PRIu64, - cache_size, max_cache_size); - goto out; - } -out: - return ret; + priv = this->private; + + LOCK(&priv->lock); + { + priv->last_child_down = now; + } + UNLOCK(&priv->lock); } -int -reconfigure (xlator_t *this, dict_t *options) +static int +qr_invalidate(xlator_t *this, void *data) { - int32_t ret = -1; - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - uint64_t cache_size_new = 0; - GF_VALIDATE_OR_GOTO ("quick-read", this, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); - GF_VALIDATE_OR_GOTO (this->name, options, out); - - priv = this->private; - - conf = &priv->conf; - if (!conf) { - goto out; - } + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + inode_t *inode = NULL; + int ret = 0; + inode_table_t *itable = NULL; + qr_private_t *priv = NULL; - GF_OPTION_RECONF ("cache-timeout", conf->cache_timeout, options, int32, - out); + up_data = (struct gf_upcall *)data; - GF_OPTION_RECONF ("cache-size", cache_size_new, options, size, out); - if (!check_cache_size_ok (this, cache_size_new)) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "Not reconfiguring cache-size"); - goto out; - } - conf->cache_size = cache_size_new; - - ret = 0; -out: - return ret; -} + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + goto out; + priv = this->private; + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; -int32_t -qr_get_priority_list (const char *opt_str, struct list_head *first) -{ - int32_t max_pri = 1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *tmp_str2 = NULL; - char *dup_str = NULL; - char *priority_str = NULL; - char *pattern = NULL; - char *priority = NULL; - char *string = NULL; - struct qr_priority *curr = NULL, *tmp = NULL; - - GF_VALIDATE_OR_GOTO ("quick-read", opt_str, out); - GF_VALIDATE_OR_GOTO ("quick-read", first, out); - - string = gf_strdup (opt_str); - if (string == NULL) { - max_pri = -1; - goto out; + if (up_ci && (up_ci->flags & 
UP_WRITE_FLAGS)) { + GF_ATOMIC_INC(priv->qr_counter.file_data_invals); + itable = ((xlator_t *)this->graph->top)->itable; + inode = inode_find(itable, up_data->gfid); + if (!inode) { + ret = -1; + goto out; } + qr_inode_prune(this, inode, qr_get_generation(this, inode)); + } - /* Get the pattern for cache priority. - * "option priority *.jpg:1,abc*:2" etc - */ - /* TODO: inode_lru in table is statically hard-coded to 5, - * should be changed to run-time configuration - */ - priority_str = strtok_r (string, ",", &tmp_str); - while (priority_str) { - curr = GF_CALLOC (1, sizeof (*curr), gf_qr_mt_qr_priority_t); - if (curr == NULL) { - max_pri = -1; - goto out; - } - - list_add_tail (&curr->list, first); - - dup_str = gf_strdup (priority_str); - if (dup_str == NULL) { - max_pri = -1; - goto out; - } - - pattern = strtok_r (dup_str, ":", &tmp_str1); - if (!pattern) { - max_pri = -1; - goto out; - } - - priority = strtok_r (NULL, ":", &tmp_str1); - if (!priority) { - max_pri = -1; - goto out; - } - - gf_log ("quick-read", GF_LOG_TRACE, - "quick-read priority : pattern %s : priority %s", - pattern, - priority); - - curr->pattern = gf_strdup (pattern); - if (curr->pattern == NULL) { - max_pri = -1; - goto out; - } - - curr->priority = strtol (priority, &tmp_str2, 0); - if (tmp_str2 && (*tmp_str2)) { - max_pri = -1; - goto out; - } else { - max_pri = max (max_pri, curr->priority); - } - - GF_FREE (dup_str); - dup_str = NULL; - - priority_str = strtok_r (NULL, ",", &tmp_str); - } out: - if (string != NULL) { - GF_FREE (string); - } + if (inode) + inode_unref(inode); - if (dup_str != NULL) { - GF_FREE (dup_str); - } - - if (max_pri == -1) { - list_for_each_entry_safe (curr, tmp, first, list) { - list_del_init (&curr->list); - GF_FREE (curr->pattern); - GF_FREE (curr); - } - } - - return max_pri; + return ret; } - -int32_t -init (xlator_t *this) +int +qr_notify(xlator_t *this, int event, void *data, ...) 
{ - int32_t ret = -1, i = 0; - qr_private_t *priv = NULL; - qr_conf_t *conf = NULL; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: volume (%s) not configured with exactly one " - "child", this->name); - return -1; - } + int ret = 0; + qr_private_t *priv = NULL; + qr_conf_t *conf = NULL; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } + priv = this->private; + conf = &priv->conf; - priv = GF_CALLOC (1, sizeof (*priv), gf_qr_mt_qr_private_t); - if (priv == NULL) { - ret = -1; - goto out; - } + switch (event) { + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_SOME_DESCENDENT_DOWN: + qr_update_child_down_time(this, gf_time()); + break; + case GF_EVENT_UPCALL: + if (conf->qr_invalidation) + ret = qr_invalidate(this, data); + break; + default: + break; + } - LOCK_INIT (&priv->table.lock); - conf = &priv->conf; + if (default_notify(this, event, data) != 0) + ret = -1; - GF_OPTION_INIT ("max-file-size", conf->max_file_size, size, out); - - GF_OPTION_INIT ("cache-timeout", conf->cache_timeout, int32, out); + return ret; +} - GF_OPTION_INIT ("cache-size", conf->cache_size, size, out); - if (!check_cache_size_ok (this, conf->cache_size)) { - ret = -1; - goto out; - } +void +qr_fini(xlator_t *this) +{ + qr_private_t *priv = NULL; - INIT_LIST_HEAD (&conf->priority_list); - conf->max_pri = 1; - if (dict_get (this->options, "priority")) { - char *option_list = data_to_str (dict_get (this->options, - "priority")); - gf_log (this->name, GF_LOG_TRACE, - "option path %s", option_list); - /* parse the list of pattern:priority */ - conf->max_pri = qr_get_priority_list (option_list, - &conf->priority_list); - - if (conf->max_pri == -1) { - goto out; - } - conf->max_pri ++; - } + if (this == NULL) { + goto out; + } - priv->table.lru = GF_CALLOC (conf->max_pri, sizeof (*priv->table.lru), - gf_common_mt_list_head); - if (priv->table.lru == NULL) { - ret = -1; - goto out; - } + priv = 
this->private; + if (priv == NULL) { + goto out; + } - for (i = 0; i < conf->max_pri; i++) { - INIT_LIST_HEAD (&priv->table.lru[i]); - } + qr_inode_table_destroy(priv); + qr_conf_destroy(&priv->conf); - ret = 0; + this->private = NULL; - this->private = priv; + GF_FREE(priv); out: - if ((ret == -1) && priv) { - GF_FREE (priv); - } - - return ret; -} - - -void -fini (xlator_t *this) -{ - return; -} - -struct xlator_fops fops = { - .lookup = qr_lookup, - .open = qr_open, - .readv = qr_readv, - .writev = qr_writev, - .fstat = qr_fstat, - .fsetxattr = qr_fsetxattr, - .fgetxattr = qr_fgetxattr, - .flush = qr_flush, - .fentrylk = qr_fentrylk, - .finodelk = qr_finodelk, - .fsync = qr_fsync, - .ftruncate = qr_ftruncate, - .lk = qr_lk, - .fsetattr = qr_fsetattr, -}; - -struct xlator_cbks cbks = { - .forget = qr_forget, - .release = qr_release, + return; +} + +struct xlator_fops qr_fops = {.lookup = qr_lookup, + .readdirp = qr_readdirp, + .open = qr_open, + .readv = qr_readv, + .writev = qr_writev, + .truncate = qr_truncate, + .ftruncate = qr_ftruncate, + .fallocate = qr_fallocate, + .discard = qr_discard, + .zerofill = qr_zerofill}; + +struct xlator_cbks qr_cbks = { + .forget = qr_forget, }; -struct xlator_dumpops dumpops = { - .priv = qr_priv_dump, - .inodectx = qr_inodectx_dump, - .fdctx = qr_fdctx_dump +struct xlator_dumpops qr_dumpops = { + .priv = qr_priv_dump, + .inodectx = qr_inodectx_dump, }; -struct volume_options options[] = { - { .key = {"priority"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"cache-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 0, - .max = 32 * GF_UNIT_GB, - .default_value = "128MB", - .description = "Size of the read cache." 
- }, - { .key = {"cache-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 60, - .default_value = "1", - }, - { .key = {"max-file-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 0, - .max = 1 * GF_UNIT_KB * 1000, - .default_value = "64KB", - }, +struct volume_options qr_options[] = { + { + .key = {"quick-read"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable quick-read", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"priority"}, .type = GF_OPTION_TYPE_ANY}, + {.key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = INFINITY, + .default_value = "128MB", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "Size of small file read cache."}, + { + .key = {"cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "1", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + }, + { + .key = {"max-file-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 1 * GF_UNIT_KB * 1000, + .default_value = "64KB", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + }, + { + .key = {"quick-read-cache-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "When \"on\", invalidates/updates the metadata cache," + " on receiving the cache-invalidation notifications", + }, + { + .key = {"ctime-invalidation"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .description = "Quick-read by default uses mtime to identify changes " + "to file data. 
However there are applications like " + "rsync which explicitly set mtime making it unreliable " + "for the purpose of identifying change in file content " + ". Since ctime also changes when content of a file " + " changes and it cannot be set explicitly, it becomes " + " suitable for identifying staleness of cached data. " + "This option makes quick-read to prefer ctime over " + "mtime to validate its cache. However, using ctime " + "can result in false positives as ctime changes with " + "just attribute changes like permission without " + "changes to file data. So, use this only when mtime " + "is not reliable", + }, + {.key = {NULL}}}; + +xlator_api_t xlator_api = { + .init = qr_init, + .fini = qr_fini, + .notify = qr_notify, + .reconfigure = qr_reconfigure, + .mem_acct_init = qr_mem_acct_init, + .dump_metrics = qr_dump_metrics, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &qr_dumpops, + .fops = &qr_fops, + .cbks = &qr_cbks, + .options = qr_options, + .identifier = "quick-read", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h index 064151b634e..20fcc70b3a7 100644 --- a/xlators/performance/quick-read/src/quick-read.h +++ b/xlators/performance/quick-read/src/quick-read.h @@ -1,40 +1,26 @@ /* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __QUICK_READ_H #define __QUICK_READ_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" -#include "call-stub.h" -#include "defaults.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/list.h> +#include <glusterfs/compat.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> #include <libgen.h> #include <sys/time.h> #include <sys/types.h> @@ -43,76 +29,63 @@ #include <fnmatch.h> #include "quick-read-mem-types.h" -struct qr_fd_ctx { - char opened; - char disabled; - char open_in_transit; - char *path; - int flags; - int wbflags; - struct list_head waiting_ops; - gf_lock_t lock; -}; -typedef struct qr_fd_ctx qr_fd_ctx_t; - -struct qr_local { - char is_open; - char *path; - char just_validated; - fd_t *fd; - int open_flags; - int32_t op_ret; - int32_t op_errno; - call_stub_t *stub; -}; -typedef struct qr_local qr_local_t; - struct qr_inode { - dict_t *xattr; - inode_t *inode; - int priority; - struct iatt stbuf; - struct timeval tv; - struct list_head lru; + void *data; + size_t size; + int priority; + uint32_t ia_mtime; + uint32_t ia_mtime_nsec; + uint32_t ia_ctime; + uint32_t ia_ctime_nsec; + uint32_t gen_rollover; + struct iatt buf; + time_t last_refresh; + struct list_head lru; + 
uint64_t gen; + uint64_t invalidation_time; }; typedef struct qr_inode qr_inode_t; struct qr_priority { - char *pattern; - int32_t priority; - struct list_head list; + char *pattern; + int32_t priority; + struct list_head list; }; typedef struct qr_priority qr_priority_t; struct qr_conf { - uint64_t max_file_size; - int32_t cache_timeout; - uint64_t cache_size; - int max_pri; - struct list_head priority_list; + uint64_t max_file_size; + int32_t cache_timeout; + uint64_t cache_size; + int max_pri; + gf_boolean_t qr_invalidation; + gf_boolean_t ctime_invalidation; + struct list_head priority_list; }; typedef struct qr_conf qr_conf_t; struct qr_inode_table { - uint64_t cache_used; - struct list_head *lru; - gf_lock_t lock; + uint64_t cache_used; + struct list_head *lru; + gf_lock_t lock; }; typedef struct qr_inode_table qr_inode_table_t; +struct qr_statistics { + gf_atomic_t cache_hit; + gf_atomic_t cache_miss; + gf_atomic_t file_data_invals; /* No. of invalidates received from upcall */ + gf_atomic_t files_cached; +}; + struct qr_private { - qr_conf_t conf; - qr_inode_table_t table; + qr_conf_t conf; + qr_inode_table_t table; + time_t last_child_down; + gf_lock_t lock; + struct qr_statistics qr_counter; + gf_atomic_int32_t generation; }; typedef struct qr_private qr_private_t; -void qr_local_free (qr_local_t *local); - -#define QR_STACK_UNWIND(op, frame, params ...) 
do { \ - qr_local_t *__local = frame->local; \ - frame->local = NULL; \ - STACK_UNWIND_STRICT (op, frame, params); \ - qr_local_free (__local); \ - } while (0) - #endif /* #ifndef __QUICK_READ_H */ diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am index b46020aacee..99efca3660c 100644 --- a/xlators/performance/read-ahead/src/Makefile.am +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -1,14 +1,16 @@ xlator_LTLIBRARIES = read-ahead.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -read_ahead_la_LDFLAGS = -module -avoidversion +read_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) read_ahead_la_SOURCES = read-ahead.c page.c read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = read-ahead.h read-ahead-mem-types.h +noinst_HEADERS = read-ahead.h read-ahead-mem-types.h read-ahead-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c index 9778ef54258..8a58ad8bb7a 100644 --- a/xlators/performance/read-ahead/src/page.c +++ b/xlators/performance/read-ahead/src/page.c @@ -1,444 +1,455 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" #include <assert.h> +#include "read-ahead-messages.h" ra_page_t * -ra_page_get (ra_file_t *file, off_t offset) +ra_page_get(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; + ra_page_t *page = NULL; + off_t rounded_offset = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - if (page == &file->pages || page->offset != rounded_offset) - page = NULL; + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; out: - return page; + return page; } - ra_page_t * -ra_page_create (ra_file_t *file, off_t offset) +ra_page_create(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; 
- ra_page_t *newpage = NULL; + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - if (page == &file->pages || page->offset != rounded_offset) { - newpage = GF_CALLOC (1, sizeof (*newpage), gf_ra_mt_ra_page_t); - if (!newpage) { - goto out; - } + if (page == &file->pages || page->offset != rounded_offset) { + newpage = GF_CALLOC(1, sizeof(*newpage), gf_ra_mt_ra_page_t); + if (!newpage) { + goto out; + } - newpage->offset = rounded_offset; - newpage->prev = page->prev; - newpage->next = page; - newpage->file = file; - page->prev->next = newpage; - page->prev = newpage; + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; - page = newpage; - } + page = newpage; + } out: - return page; + return page; } - void -ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +ra_wait_on_page(ra_page_t *page, call_frame_t *frame) { - ra_waitq_t *waitq = NULL; - ra_local_t *local = NULL; + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, page, out); + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); - local = frame->local; + local = frame->local; - waitq = GF_CALLOC (1, sizeof (*waitq), gf_ra_mt_ra_waitq_t); - if (!waitq) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } + waitq = GF_CALLOC(1, sizeof(*waitq), gf_ra_mt_ra_waitq_t); + if (!waitq) { + 
local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } - waitq->data = frame; - waitq->next = page->waitq; - page->waitq = waitq; + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; - ra_local_lock (local); - { - local->wait_count++; - } - ra_local_unlock (local); + ra_local_lock(local); + { + local->wait_count++; + } + ra_local_unlock(local); out: - return; + return; } - void -ra_waitq_return (ra_waitq_t *waitq) +ra_waitq_return(ra_waitq_t *waitq) { - ra_waitq_t *trav = NULL; - ra_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ra_frame_return (frame); - GF_FREE (trav); - } + frame = trav->data; + ra_frame_return(frame); + GF_FREE(trav); + } - return; + return; } - int -ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) +ra_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - ra_local_t *local = NULL; - off_t pending_offset = 0; - ra_file_t *file = NULL; - ra_page_t *page = NULL; - ra_waitq_t *waitq = NULL; - fd_t *fd = NULL; - uint64_t tmp_file = 0; - - GF_ASSERT (frame); - - local = frame->local; - fd = local->fd; - - fd_ctx_get (fd, this, &tmp_file); - - file = (ra_file_t *)(long)tmp_file; - pending_offset = local->pending_offset; - - if (file == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "read-ahead context not set in fd (%p)", fd); - op_ret = -1; - op_errno = EBADF; - goto out; + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + fd_t 
*fd = NULL; + uint64_t tmp_file = 0; + gf_boolean_t stale = _gf_false; + + GF_ASSERT(frame); + + local = frame->local; + fd = local->fd; + + fd_ctx_get(fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + + if (file == NULL) { + gf_msg(this->name, GF_LOG_WARNING, EBADF, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + "read-ahead context not set in fd (%p)", fd); + op_ret = -1; + op_errno = EBADF; + goto out; + } + + ra_file_lock(file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + page = ra_page_get(file, pending_offset); + + if (!page) { + gf_msg_trace(this->name, 0, + "wasted copy: " + "%" PRId64 "[+%" PRId64 "] file=%p", + pending_offset, file->page_size, file); + goto unlock; } - ra_file_lock (file); - { - if (op_ret >= 0) - file->stbuf = *stbuf; - - if (op_ret < 0) { - page = ra_page_get (file, pending_offset); - if (page) - waitq = ra_page_error (page, op_ret, op_errno); - goto unlock; - } - - page = ra_page_get (file, pending_offset); - if (!page) { - gf_log (this->name, GF_LOG_TRACE, - "wasted copy: %"PRId64"[+%"PRId64"] file=%p", - pending_offset, file->page_size, file); - goto unlock; - } - - if (page->vector) { - iobref_unref (page->iobref); - GF_FREE (page->vector); - } - - page->vector = iov_dup (vector, count); - if (page->vector == NULL) { - waitq = ra_page_error (page, -1, ENOMEM); - goto unlock; - } - - page->count = count; - page->iobref = iobref_ref (iobref); - page->ready = 1; - - page->size = iov_length (vector, count); - - waitq = ra_page_wakeup (page); + if (page->stale) { + page->stale = 0; + page->ready = 0; + stale = 1; + goto unlock; } -unlock: - ra_file_unlock (file); - - ra_waitq_return (waitq); - - fd_unref (local->fd); - GF_FREE (frame->local); - frame->local = NULL; - -out: - STACK_DESTROY (frame->root); - return 0; -} + /* + * "Dirty" means that the request was a pure read-ahead; it's + * set for requests we issue ourselves, and cleared when user + * requests are issued or put on the 
waitq. "Poisoned" means + * that we got a write while a read was still in flight, and we + * couldn't stop it so we marked it instead. If it's both + * dirty and poisoned by the time we get here, we cancel its + * effect so that a subsequent user read doesn't get data that + * we know is stale (because we made it stale ourselves). We + * can't use ESTALE because that has special significance. + * ECANCELED has no such special meaning, and is close to what + * we're trying to indicate. + */ + if (page->dirty && page->poisoned) { + op_ret = -1; + op_errno = ECANCELED; + } + if (op_ret < 0) { + waitq = ra_page_error(page, op_ret, op_errno); + goto unlock; + } -void -ra_page_fault (ra_file_t *file, call_frame_t *frame, off_t offset) -{ - call_frame_t *fault_frame = NULL; - ra_local_t *fault_local = NULL; - ra_page_t *page = NULL; - ra_waitq_t *waitq = NULL; - int32_t op_ret = -1, op_errno = -1; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - fault_frame = copy_frame (frame); - if (fault_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto err; + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); } - fault_local = GF_CALLOC (1, sizeof (ra_local_t), gf_ra_mt_ra_local_t); - if (fault_local == NULL) { - STACK_DESTROY (fault_frame->root); - op_ret = -1; - op_errno = ENOMEM; - goto err; + page->vector = iov_dup(vector, count); + if (page->vector == NULL) { + waitq = ra_page_error(page, -1, ENOMEM); + goto unlock; } - fault_frame->local = fault_local; - fault_local->pending_offset = offset; - fault_local->pending_size = file->page_size; + page->count = count; + page->iobref = iobref_ref(iobref); + page->ready = 1; - fault_local->fd = fd_ref (file->fd); + page->size = iov_length(vector, count); - STACK_WIND (fault_frame, ra_fault_cbk, - FIRST_CHILD (fault_frame->this), - FIRST_CHILD (fault_frame->this)->fops->readv, - file->fd, file->page_size, offset); + waitq = ra_page_wakeup(page); 
+ } +unlock: + ra_file_unlock(file); - return; + if (stale) { + STACK_WIND(frame, ra_fault_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, local->fd, + local->pending_size, local->pending_offset, 0, NULL); -err: - ra_file_lock (file); - { - page = ra_page_get (file, offset); - if (page) - waitq = ra_page_error (page, op_ret, - op_errno); - } - ra_file_unlock (file); + return 0; + } - if (waitq != NULL) { - ra_waitq_return (waitq); - } + ra_waitq_return(waitq); + + fd_unref(local->fd); + + mem_put(frame->local); + frame->local = NULL; out: - return; + STACK_DESTROY(frame->root); + return 0; } - void -ra_frame_fill (ra_page_t *page, call_frame_t *frame) +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ra_fill_t *new = NULL; + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + int32_t op_ret = -1, op_errno = -1; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + fault_frame = copy_frame(frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_local = mem_get0(THIS->local_pool); + if (fault_local == NULL) { + STACK_DESTROY(fault_frame->root); + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref(file->fd); + + STACK_WIND(fault_frame, ra_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, file->fd, + file->page_size, offset, 0, NULL); + + return; - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, page, out); - - local = frame->local; - fill = &local->fill; - - if (local->op_ret != -1 && page->size) { - if 
(local->offset > page->offset) - src_offset = local->offset - page->offset; - else - dst_offset = page->offset - local->offset; - - copy_size = min (page->size - src_offset, - local->size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } - - fill = fill->next; - while (fill != &local->fill) { - if (fill->offset > page->offset) { - break; - } - fill = fill->next; - } - - new = GF_CALLOC (1, sizeof (*new), gf_ra_mt_ra_fill_t); - if (new == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - NULL); - new->vector = GF_CALLOC (new->count, sizeof (struct iovec), - gf_ra_mt_iovec); - if (new->vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - GF_FREE (new); - goto out; - } - - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - new->vector); - - new->next = fill; - new->prev = new->next->prev; - new->next->prev = new; - new->prev->next = new; - - local->op_ret += copy_size; - } +err: + ra_file_lock(file); + { + page = ra_page_get(file, offset); + if (page) + waitq = ra_page_error(page, op_ret, op_errno); + } + ra_file_unlock(file); + + if (waitq != NULL) { + ra_waitq_return(waitq); + } out: - return; + return; } - void -ra_frame_unwind (call_frame_t *frame) +ra_frame_fill(ra_page_t *page, call_frame_t *frame) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - int32_t count = 0; - struct iovec *vector = NULL; - int32_t copied = 0; - struct iobref *iobref = NULL; - ra_fill_t *next = NULL; - fd_t *fd = NULL; - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - - local = frame->local; - fill = local->fill.next; - - iobref = 
iobref_new (); - if (iobref == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min(page->size - src_offset, local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; } - frame->local = NULL; - + fill = fill->next; while (fill != &local->fill) { - count += fill->count; - fill = fill->next; + if (fill->offset > page->offset) { + break; + } + fill = fill->next; } - vector = GF_CALLOC (count, sizeof (*vector), gf_ra_mt_iovec); - if (vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - iobref_unref (iobref); - iobref = NULL; + new = GF_CALLOC(1, sizeof(*new), gf_ra_mt_ra_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref(page->iobref); + new->count = iov_subset(page->vector, page->count, src_offset, + copy_size, &new->vector, 0); + if (new->count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(new->iobref); + GF_FREE(new); + goto out; } - fill = local->fill.next; + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; - while (fill != &local->fill) { - next = fill->next; + local->op_ret += copy_size; + } - if ((vector != NULL) && (iobref != NULL)) { - memcpy (((char *)vector) + copied, fill->vector, - fill->count * sizeof (*vector)); +out: 
+ return; +} - copied += (fill->count * sizeof (*vector)); - iobref_merge (iobref, fill->iobref); - } +void +ra_frame_unwind(call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + ra_fill_t *next = NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + + local = frame->local; + fill = local->fill.next; + + iobref = iobref_new(); + if (iobref == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + vector = GF_CALLOC(count, sizeof(*vector), gf_ra_mt_iovec); + if (vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + if ((vector != NULL) && (iobref != NULL)) { + memcpy(((char *)vector) + copied, fill->vector, + fill->count * sizeof(*vector)); + + copied += (fill->count * sizeof(*vector)); + if (iobref_merge(iobref, fill->iobref)) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + } - fill->next->prev = fill->prev; - fill->prev->next = fill->prev; + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; - iobref_unref (fill->iobref); - GF_FREE (fill->vector); - GF_FREE (fill); + iobref_unref(fill->iobref); + GF_FREE(fill->vector); + GF_FREE(fill); - fill = next; - } + fill = next; + } - fd = local->fd; - fd_ctx_get (fd, frame->this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + fd = local->fd; + fd_ctx_get(fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; - STACK_UNWIND_STRICT (readv, frame, local->op_ret, local->op_errno, - vector, count, &file->stbuf, iobref); + STACK_UNWIND_STRICT(readv, frame, local->op_ret, 
local->op_errno, vector, + count, &file->stbuf, iobref, NULL); - iobref_unref (iobref); - pthread_mutex_destroy (&local->local_lock); - GF_FREE (local); - GF_FREE (vector); + iobref_unref(iobref); + pthread_mutex_destroy(&local->local_lock); + mem_put(local); + GF_FREE(vector); out: - return; + return; } /* @@ -447,27 +458,27 @@ out: * */ void -ra_frame_return (call_frame_t *frame) +ra_frame_return(call_frame_t *frame) { - ra_local_t *local = NULL; - int32_t wait_count = 0; + ra_local_t *local = NULL; + int32_t wait_count = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); - local = frame->local; - GF_ASSERT (local->wait_count > 0); + local = frame->local; + GF_ASSERT(local->wait_count > 0); - ra_local_lock (local); - { - wait_count = --local->wait_count; - } - ra_local_unlock (local); + ra_local_lock(local); + { + wait_count = --local->wait_count; + } + ra_local_unlock(local); - if (!wait_count) - ra_frame_unwind (frame); + if (!wait_count) + ra_frame_unwind(frame); out: - return; + return; } /* @@ -476,23 +487,26 @@ out: * */ ra_waitq_t * -ra_page_wakeup (ra_page_t *page) +ra_page_wakeup(ra_page_t *page) { - ra_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ra_frame_fill (page, frame); - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill(page, frame); + } + if (page->stale) { + ra_page_purge(page); + } out: - return waitq; + return waitq; } /* @@ -501,22 +515,22 @@ out: * */ void -ra_page_purge (ra_page_t *page) +ra_page_purge(ra_page_t *page) { - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", 
page, out); - page->prev->next = page->next; - page->next->prev = page->prev; + page->prev->next = page->next; + page->next->prev = page->prev; - if (page->iobref) { - iobref_unref (page->iobref); - } + if (page->iobref) { + iobref_unref(page->iobref); + } - GF_FREE (page->vector); - GF_FREE (page); + GF_FREE(page->vector); + GF_FREE(page); out: - return; + return; } /* @@ -527,32 +541,32 @@ out: * */ ra_waitq_t * -ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno) { - ra_waitq_t *waitq = NULL; - ra_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - ra_local_t *local = NULL; + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + ra_local_t *local = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", page, out); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - local = frame->local; - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; } + } - ra_page_purge (page); + ra_page_purge(page); out: - return waitq; + return waitq; } /* @@ -561,31 +575,31 @@ out: * */ void -ra_file_destroy (ra_file_t *file) +ra_file_destroy(ra_file_t *file) { - ra_conf_t *conf = NULL; - ra_page_t *trav = NULL; + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", file, out); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - conf = file->conf; + conf = file->conf; - ra_conf_lock (conf); - { - file->prev->next = file->next; - file->next->prev = file->prev; - } - ra_conf_unlock (conf); + ra_conf_lock(conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + 
ra_conf_unlock(conf); + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error(trav, -1, EINVAL); trav = file->pages.next; - while (trav != &file->pages) { - ra_page_error (trav, -1, EINVAL); - trav = file->pages.next; - } + } - pthread_mutex_destroy (&file->file_lock); - GF_FREE (file); + pthread_mutex_destroy(&file->file_lock); + GF_FREE(file); out: - return; + return; } diff --git a/xlators/performance/read-ahead/src/read-ahead-mem-types.h b/xlators/performance/read-ahead/src/read-ahead-mem-types.h index 7ca09369653..f07cfc5bba5 100644 --- a/xlators/performance/read-ahead/src/read-ahead-mem-types.h +++ b/xlators/performance/read-ahead/src/read-ahead-mem-types.h @@ -1,36 +1,25 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - #ifndef __RA_MEM_TYPES_H__ #define __RA_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_ra_mem_types_ { - gf_ra_mt_ra_file_t = gf_common_mt_end + 1, - gf_ra_mt_ra_local_t, - gf_ra_mt_ra_conf_t, - gf_ra_mt_ra_page_t, - gf_ra_mt_ra_waitq_t, - gf_ra_mt_ra_fill_t, - gf_ra_mt_iovec, - gf_ra_mt_end + gf_ra_mt_ra_file_t = gf_common_mt_end + 1, + gf_ra_mt_ra_conf_t, + gf_ra_mt_ra_page_t, + gf_ra_mt_ra_waitq_t, + gf_ra_mt_ra_fill_t, + gf_ra_mt_iovec, + gf_ra_mt_end }; #endif diff --git a/xlators/performance/read-ahead/src/read-ahead-messages.h b/xlators/performance/read-ahead/src/read-ahead-messages.h new file mode 100644 index 00000000000..0302b7a7122 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _READ_AHEAD_MESSAGES_H_ +#define _READ_AHEAD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(READ_AHEAD, READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + READ_AHEAD_MSG_VOL_MISCONFIGURED, READ_AHEAD_MSG_NO_MEMORY, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + READ_AHEAD_MSG_XLATOR_CONF_NULL); + +#endif /* _READ_AHEAD_MESSAGES_H_ */ diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index 37f34f2eb91..5246e1317d2 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ /* @@ -24,206 +15,187 @@ - ensure efficient memory management in case of random seek */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include <assert.h> #include <sys/time.h> +#include "read-ahead-messages.h" static void -read_ahead (call_frame_t *frame, ra_file_t *file); - +read_ahead(call_frame_t *frame, ra_file_t *file); int -ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +ra_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - long wbflags = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - wbflags = (long)frame->local; - - file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - if (wbflags & GF_OPEN_NOWB) { - file->disabled = 1; - } - - file->offset = (unsigned long long) 0; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init 
(&file->file_lock, NULL); - - if (!file->disabled) { - file->page_count = 1; - } - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cannot set read-ahead context information in fd (%p)", - fd); - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + if (!file->disabled) { + file->page_count = 1; + } + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(frame->this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read-ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } - int -ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, 
inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) +ra_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = GF_CALLOC (1, sizeof (*file), gf_ra_mt_ra_file_t); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - //file->size = fd->inode->buf.ia_size; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set read ahead context information in fd (%p)", - fd); - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it 
*/ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + // file->size = fd->inode->buf.ia_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int -ra_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +ra_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - GF_ASSERT (frame); - GF_ASSERT (this); + GF_ASSERT(frame); + GF_ASSERT(this); - frame->local = (void *)(long)wbflags; + STACK_WIND(frame, ra_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - STACK_WIND (frame, ra_open_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, - loc, flags, fd, wbflags); - - return 0; + return 0; } - int -ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd, dict_t *params) +ra_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t 
*xdata) { - GF_ASSERT (frame); - GF_ASSERT (this); + GF_ASSERT(frame); + GF_ASSERT(this); - STACK_WIND (frame, ra_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); + STACK_WIND(frame, ra_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); - return 0; + return 0; } /* free cache pages between offset and offset+size, @@ -231,898 +203,1070 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ static void -flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size) +flush_region(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, + int for_write) { - ra_page_t *trav = NULL; - ra_page_t *next = NULL; - - ra_file_lock (file); - { - trav = file->pages.next; - while (trav != &file->pages - && trav->offset < (offset + size)) { - - next = trav->next; - if (trav->offset >= offset && !trav->waitq) { - ra_page_purge (trav); - } - trav = next; + ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + ra_file_lock(file); + { + trav = file->pages.next; + while (trav != &file->pages && trav->offset < (offset + size)) { + next = trav->next; + if (trav->offset >= offset) { + if (!trav->waitq) { + ra_page_purge(trav); + } else { + trav->stale = 1; + + if (for_write) { + trav->poisoned = 1; + } } + } + trav = next; } - ra_file_unlock (file); + } + ra_file_unlock(file); } - int -ra_release (xlator_t *this, fd_t *fd) +ra_release(xlator_t *this, fd_t *fd) { - uint64_t tmp_file = 0; - int ret = 0; + uint64_t tmp_file = 0; + int ret = 0; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - ret = fd_ctx_del (fd, this, &tmp_file); + ret = fd_ctx_del(fd, this, &tmp_file); - if (!ret) { - ra_file_destroy ((ra_file_t *)(long)tmp_file); - } + if (!ret) { + ra_file_destroy((ra_file_t *)(long)tmp_file); + } 
out: - return 0; + return 0; } - void -read_ahead (call_frame_t *frame, ra_file_t *file) +read_ahead(call_frame_t *frame, ra_file_t *file) { - off_t ra_offset = 0; - size_t ra_size = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - off_t cap = 0; - char fault = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - if (!file->page_count) { - goto out; + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + if (!file->page_count) { + goto out; + } + + ra_size = file->page_size * file->page_count; + ra_offset = gf_floor(file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min(file->offset + ra_size, cap)) { + ra_file_lock(file); + { + trav = ra_page_get(file, ra_offset); } + ra_file_unlock(file); - ra_size = file->page_size * file->page_count; - ra_offset = floor (file->offset, file->page_size); - cap = file->size ? file->size : file->offset + ra_size; + if (!trav) + break; - while (ra_offset < min (file->offset + ra_size, cap)) { + ra_offset += file->page_size; + } - ra_file_lock (file); - { - trav = ra_page_get (file, ra_offset); - } - ra_file_unlock (file); + if (trav) { + /* comfortable enough */ + goto out; + } - if (!trav) - break; + trav_offset = ra_offset; - ra_offset += file->page_size; - } + cap = file->size ? file->size : ra_offset + ra_size; - if (trav) { - /* comfortable enough */ - goto out; + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock(file); + { + trav = ra_page_get(file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create(file, trav_offset); + if (trav) + trav->dirty = 1; + } } + ra_file_unlock(file); - trav_offset = ra_offset; - - cap = file->size ? 
file->size : ra_offset + ra_size; - - while (trav_offset < min(ra_offset + ra_size, cap)) { - fault = 0; - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - fault = 1; - trav = ra_page_create (file, trav_offset); - if (trav) - trav->dirty = 1; - } - } - ra_file_unlock (file); - - if (!trav) { - /* OUT OF MEMORY */ - break; - } + if (!trav) { + /* OUT OF MEMORY */ + break; + } - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "RA at offset=%"PRId64, trav_offset); - ra_page_fault (file, frame, trav_offset); - } - trav_offset += file->page_size; + if (fault) { + gf_msg_trace(frame->this->name, 0, "RA at offset=%" PRId64, + trav_offset); + ra_page_fault(file, frame, trav_offset); } + trav_offset += file->page_size; + } out: - return; + return; } - int -ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) +ra_need_atime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - GF_ASSERT (frame); - STACK_DESTROY (frame->root); - return 0; + GF_ASSERT(frame); + STACK_DESTROY(frame->root); + return 0; } - static void -dispatch_requests (call_frame_t *frame, ra_file_t *file) +dispatch_requests(call_frame_t *frame, ra_file_t *file) { - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - call_frame_t *ra_frame = NULL; - char need_atime_update = 1; - char fault = 0; - - GF_VALIDATE_OR_GOTO ("read-ahead", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - - local = frame->local; - conf = file->conf; - - rounded_offset = floor (local->offset, file->page_size); - rounded_end = roof (local->offset + local->size, file->page_size); - - trav_offset = 
rounded_offset; - - while (trav_offset < rounded_end) { - fault = 0; - - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - trav = ra_page_create (file, trav_offset); - fault = 1; - need_atime_update = 0; - } - - if (!trav) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - - if (trav->ready) { - gf_log (frame->this->name, GF_LOG_TRACE, - "HIT at offset=%"PRId64".", - trav_offset); - ra_frame_fill (trav, frame); - } else { - gf_log (frame->this->name, GF_LOG_TRACE, - "IN-TRANSIT at offset=%"PRId64".", - trav_offset); - ra_wait_on_page (trav, frame); - need_atime_update = 0; - } - } - unlock: - ra_file_unlock (file); + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; - if (local->op_ret == -1) { - goto out; - } + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + local = frame->local; + conf = file->conf; + + rounded_offset = gf_floor(local->offset, file->page_size); + rounded_end = gf_roof(local->offset + local->size, file->page_size); - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "MISS at offset=%"PRId64".", - trav_offset); - ra_page_fault (file, frame, trav_offset); + trav_offset = rounded_offset; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock(file); + { + trav = ra_page_get(file, trav_offset); + if (!trav) { + trav = ra_page_create(file, trav_offset); + if (!trav) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + fault = 1; + need_atime_update = 0; + } + trav->dirty = 0; + + if (trav->ready) { + gf_msg_trace(frame->this->name, 0, "HIT at offset=%" PRId64 ".", + trav_offset); + ra_frame_fill(trav, frame); + } else { + gf_msg_trace(frame->this->name, 0, + "IN-TRANSIT at " + "offset=%" PRId64 ".", + trav_offset); + 
ra_wait_on_page(trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock(file); - trav_offset += file->page_size; + if (local->op_ret == -1) { + goto out; } - if (need_atime_update && conf->force_atime_update) { - /* TODO: use untimens() since readv() can confuse underlying - io-cache and others */ - ra_frame = copy_frame (frame); - if (ra_frame == NULL) { - goto out; - } + if (fault) { + gf_msg_trace(frame->this->name, 0, "MISS at offset=%" PRId64 ".", + trav_offset); + ra_page_fault(file, frame, trav_offset); + } + + trav_offset += file->page_size; + } - STACK_WIND (ra_frame, ra_need_atime_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, 1, 1); + if (need_atime_update && conf->force_atime_update) { + /* TODO: use untimens() since readv() can confuse underlying + io-cache and others */ + ra_frame = copy_frame(frame); + if (ra_frame == NULL) { + goto out; } + STACK_WIND(ra_frame, ra_need_atime_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, file->fd, 1, 1, 0, + NULL); + } + out: - return ; + return; } - int -ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct iatt *stbuf, struct iobref *iobref) +ra_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref); + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); - return 0; + return 0; } - int -ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +ra_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - ra_file_t *file = NULL; - 
ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - int op_errno = EINVAL; - char expected_offset = 1; - uint64_t tmp_file = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - conf = this->private; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", - offset, size); - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file || file->disabled) { - goto disabled; - } - - if (file->offset != offset) { - gf_log (this->name, GF_LOG_TRACE, - "unexpected offset (%"PRId64" != %"PRId64") resetting", - file->offset, offset); - - expected_offset = file->expected = file->page_count = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "expected offset (%"PRId64") when page_count=%d", - offset, file->page_count); - - if (file->expected < (conf->page_size * conf->page_count)) { - file->expected += size; - file->page_count = min ((file->expected - / file->page_size), - conf->page_count); - } + ra_file_t *file = NULL; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + int op_errno = EINVAL; + char expected_offset = 1; + uint64_t tmp_file = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + conf = this->private; + + gf_msg_trace(this->name, 0, + "NEW REQ at offset=%" PRId64 " for size=%" GF_PRI_SIZET "", + offset, size); + + fd_ctx_get(fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file || file->disabled) { + goto disabled; + } + + if (file->offset != offset) { + gf_msg_trace(this->name, 0, + "unexpected offset (%" PRId64 " != %" PRId64 + ") " + "resetting", + file->offset, offset); + + expected_offset = file->expected = file->page_count = 0; + } else { + gf_msg_trace(this->name, 0, + "expected offset (%" PRId64 ") when page_count=%d", offset, + file->page_count); + + if (file->expected < 
(file->page_size * conf->page_count)) { + file->expected += size; + file->page_count = min((file->expected / file->page_size), + conf->page_count); } + } - if (!expected_offset) { - flush_region (frame, file, 0, file->pages.prev->offset + 1); - } + if (!expected_offset) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } - local = (void *) GF_CALLOC (1, sizeof (*local), gf_ra_mt_ra_local_t); - if (!local) { - op_errno = ENOMEM; - goto unwind; - } + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } - local->fd = fd; - local->offset = offset; - local->size = size; - local->wait_count = 1; + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; - local->fill.next = &local->fill; - local->fill.prev = &local->fill; + local->fill.next = &local->fill; + local->fill.prev = &local->fill; - pthread_mutex_init (&local->local_lock, NULL); + pthread_mutex_init(&local->local_lock, NULL); - frame->local = local; + frame->local = local; - dispatch_requests (frame, file); + dispatch_requests(frame, file); - flush_region (frame, file, 0, floor (offset, file->page_size)); + flush_region(frame, file, 0, gf_floor(offset, file->page_size), 0); - read_ahead (frame, file); + read_ahead(frame, file); - ra_frame_return (frame); + file->offset = offset + size; - file->offset = offset + size; + ra_frame_return(frame); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; disabled: - STACK_WIND (frame, ra_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - fd, size, offset); - return 0; + STACK_WIND(frame, ra_readv_disabled_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; } - int -ra_flush_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, int32_t op_ret, - int32_t op_errno) +ra_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - GF_ASSERT (frame); - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); - return 0; + GF_ASSERT(frame); + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata); + return 0; } - - int -ra_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf) +ra_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - GF_ASSERT (frame); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + GF_ASSERT(frame); + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } - int -ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +ra_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + int32_t op_errno = EINVAL; - fd_ctx_get (fd, this, &tmp_file); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } - - STACK_WIND (frame, ra_flush_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, fd); - return 0; + STACK_WIND(frame, ra_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (flush, frame, -1, op_errno); - return 0; + STACK_UNWIND_STRICT(flush, frame, -1, op_errno, NULL); + return 0; } - int -ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) 
+ra_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + int32_t op_errno = EINVAL; - fd_ctx_get (fd, this, &tmp_file); - - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - STACK_WIND (frame, ra_fsync_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, fd, datasync); - return 0; + STACK_WIND(frame, ra_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +ra_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - ra_file_t *file = NULL; + ra_file_t *file = NULL; - GF_ASSERT (frame); + GF_ASSERT(frame); - file = frame->local; + file = frame->local; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } + if (file) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + } - frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; + frame->local = NULL; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int -ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, struct 
iobref *iobref) +ra_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - ra_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + + if (iter_fd == fd) frame->local = file; - /* reset the read-ahead counters too */ - file->expected = file->page_count = 0; + + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; } + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, iobref); + STACK_WIND(frame, ra_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) 
+ra_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - int -ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) +ra_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - GF_ASSERT (frame); + GF_ASSERT(frame); - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf); - return 0; + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; } - int -ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +ra_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, loc, unwind); - inode = loc->inode; + inode = loc->inode; - LOCK (&inode->lock); + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + 
file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_ftruncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_truncate_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - loc, offset); - return 0; + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - void -ra_page_dump (struct ra_page *page) +ra_page_dump(struct ra_page *page) { - int i = 0; - call_frame_t *frame = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - ra_waitq_t *trav = NULL; + int i = 0; + call_frame_t *frame = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + ra_waitq_t *trav = NULL; - if (page == NULL) { - goto out; - } + if (page == NULL) { + goto out; + } - gf_proc_dump_write ("offset", "%"PRId64, page->offset); + gf_proc_dump_write("offset", "%" PRId64, page->offset); - gf_proc_dump_write ("size", "%"PRId64, page->size); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size); - gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no"); - gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); + gf_proc_dump_write("poisoned", "%s", page->poisoned ? 
"yes" : "no"); - for (trav = page->waitq; trav; trav = trav->next) { - frame = trav->data; - sprintf (key, "waiting-frame[%d]", i++); - gf_proc_dump_write (key, "%"PRId64, frame->root->unique); - } + gf_proc_dump_write("ready", "%s", page->ready ? "yes" : "no"); + + for (trav = page->waitq; trav; trav = trav->next) { + frame = trav->data; + sprintf(key, "waiting-frame[%d]", i++); + gf_proc_dump_write(key, "%" PRId64, frame->root->unique); + } out: - return; + return; } int32_t -ra_fdctx_dump (xlator_t *this, fd_t *fd) +ra_fdctx_dump(xlator_t *this, fd_t *fd) { - ra_file_t *file = NULL; - ra_page_t *page = NULL; - int32_t ret = 0, i = 0; - uint64_t tmp_file = 0; - char *path = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - - fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (file == NULL) { - ret = 0; - goto out; - } + ra_file_t *file = NULL; + ra_page_t *page = NULL; + int32_t ret = 0, i = 0; + uint64_t tmp_file = 0; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + fd_ctx_get(fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file == NULL) { + ret = 0; + goto out; + } - gf_proc_dump_build_key (key_prefix, - "xlator.performance.read-ahead", - "file"); + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "file"); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_add_section("%s", key_prefix); - ret = __inode_path (fd->inode, NULL, &path); - if (path != NULL) { - gf_proc_dump_write ("path", "%s", path); - GF_FREE (path); - } + ret = __inode_path(fd->inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } - gf_proc_dump_write ("fd", "%p", fd); + gf_proc_dump_write("fd", "%p", fd); - gf_proc_dump_write ("disabled", "%s", file->disabled ? "yes" : "no"); + gf_proc_dump_write("disabled", "%s", file->disabled ? 
"yes" : "no"); - if (file->disabled) { - ret = 0; - goto out; - } + if (file->disabled) { + ret = 0; + goto out; + } - gf_proc_dump_write ("page-size", "%"PRId64, file->page_size); + gf_proc_dump_write("page-size", "%" PRId64, file->page_size); - gf_proc_dump_write ("page-count", "%u", file->page_count); + gf_proc_dump_write("page-count", "%u", file->page_count); - gf_proc_dump_write ("next-expected-offset-for-sequential-reads", - "%"PRId64, file->offset); + gf_proc_dump_write("next-expected-offset-for-sequential-reads", "%" PRId64, + file->offset); - for (page = file->pages.next; page != &file->pages; - page = page->next) { - sprintf (key, "page[%d]", i); - gf_proc_dump_write (key, "%p", page[i++]); - ra_page_dump (page); - } + for (page = file->pages.next; page != &file->pages; page = page->next) { + gf_proc_dump_write("page", "%d: %p", i++, (void *)page); + ra_page_dump(page); + } - ret = 0; + ret = 0; out: - return ret; + return ret; } int -ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +ra_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + ra_conf_t *conf = NULL; + + conf = this->private; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - inode = fd->inode; + inode = fd->inode; - LOCK (&inode->lock); + if (conf->force_atime_update) { + LOCK(&inode->lock); { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (!file) - continue; - flush_region (frame, file, 0, - 
file->pages.prev->offset + 1); - } + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } } - UNLOCK (&inode->lock); + UNLOCK(&inode->lock); + } - STACK_WIND (frame, ra_attr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, fd); - return 0; + STACK_WIND(frame, ra_attr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL); - return 0; + STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, NULL); + return 0; } - int -ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +ra_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - uint64_t tmp_file = 0; - int32_t op_errno = EINVAL; + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - inode = fd->inode; + inode = fd->inode; - LOCK (&inode->lock); + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. 
+ * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_truncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); } - UNLOCK (&inode->lock); + } + UNLOCK(&inode->lock); - STACK_WIND (frame, ra_truncate_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, fd, offset); - return 0; + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_priv_dump (xlator_t *this) +ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - ra_conf_t *conf = NULL; - int ret = -1; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + GF_ASSERT(frame); - if (!this) { - goto out; - } + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - conf = this->private; - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, "conf null in xlator"); - goto out; - } +static int +ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; - ret = pthread_mutex_trylock (&conf->conf_lock); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Unable to lock client %s " - "(%s)", this->name, strerror (ret)); - ret = -1; - goto out; + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, 
&inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); } + } + UNLOCK(&inode->lock); - gf_proc_dump_build_key (key_prefix, "xlator.performance.read-ahead", - "priv"); + STACK_WIND(frame, ra_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; - gf_proc_dump_add_section (key_prefix); - gf_proc_dump_write ("page_size", "%d", conf->page_size); - gf_proc_dump_write ("page_count", "%d", conf->page_count); - gf_proc_dump_write ("force_atime_update", "%d", conf->force_atime_update); +unwind: + STACK_UNWIND_STRICT(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - pthread_mutex_unlock (&conf->conf_lock); +int +ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT(frame); - ret = 0; -out: - return ret; + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +static int +ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK(&inode->lock); + + STACK_WIND(frame, ra_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; + +unwind: + 
STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +ra_priv_dump(xlator_t *this) +{ + ra_conf_t *conf = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + if (!this) { + goto out; + } + + conf = this->private; + if (!conf) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_XLATOR_CONF_NULL, + "conf null in xlator"); + goto out; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "priv"); + + gf_proc_dump_add_section("%s", key_prefix); + + ret = pthread_mutex_trylock(&conf->conf_lock); + if (ret) + goto out; + { + gf_proc_dump_write("page_size", "%" PRIu64, conf->page_size); + gf_proc_dump_write("page_count", "%d", conf->page_count); + gf_proc_dump_write("force_atime_update", "%d", + conf->force_atime_update); + } + pthread_mutex_unlock(&conf->conf_lock); + + ret = 0; +out: + if (ret && conf) { + gf_proc_dump_write("Unable to dump priv", + "(Lock acquisition failed) %s", this->name); + } + return ret; +} int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - if (!this) { - goto out; - } + if (!this) { + goto out; + } - ret = xlator_mem_acct_init (this, gf_ra_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_ra_mt_end + 1); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - } + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + } out: - return ret; + return ret; } int -init (xlator_t *this) +reconfigure(xlator_t *this, dict_t *options) { - ra_conf_t *conf = NULL; - dict_t *options = NULL; - char *page_count_string = NULL; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); - - options = this->options; - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: read-ahead not configured with exactly one" - " child"); - goto out; - } + ra_conf_t 
*conf = NULL; + int ret = -1; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this->private, out); - conf = (void *) GF_CALLOC (1, sizeof (*conf), gf_ra_mt_ra_conf_t); - if (conf == NULL) { - goto out; - } + conf = this->private; - conf->page_size = this->ctx->page_size; - conf->page_count = 4; + GF_OPTION_RECONF("page-count", conf->page_count, options, uint32, out); - if (dict_get (options, "page-count")) { - page_count_string = data_to_str (dict_get (options, - "page-count")); - } + GF_OPTION_RECONF("page-size", conf->page_size, options, size_uint64, out); - if (page_count_string) { - if (gf_string2uint_base10 (page_count_string, &conf->page_count) - != 0) { - gf_log ("read-ahead", GF_LOG_ERROR, - "invalid number format \"%s\" of \"option " - "page-count\"", - page_count_string); - goto out; - } + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); - gf_log (this->name, GF_LOG_WARNING, - "Using conf->page_count = %u", conf->page_count); - } + ret = 0; +out: + return ret; +} - if (dict_get (options, "force-atime-update")) { - char *force_atime_update_str = NULL; +int +init(xlator_t *this) +{ + ra_conf_t *conf = NULL; + int32_t ret = -1; - force_atime_update_str - = data_to_str (dict_get (options, - "force-atime-update")); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); - if (gf_string2boolean (force_atime_update_str, - &conf->force_atime_update) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'force-atime-update' takes only boolean " - "options"); - goto out; - } + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: read-ahead not configured with exactly one" + " child"); + goto out; + } - if (conf->force_atime_update) { - gf_log (this->name, GF_LOG_WARNING, "Forcing atime " - "updates on cache hit"); - } - } + if 
(!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_VOL_MISCONFIGURED, + "dangling volume. check volfile "); + } - conf->files.next = &conf->files; - conf->files.prev = &conf->files; + conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_ra_mt_ra_conf_t); + if (conf == NULL) { + goto out; + } - pthread_mutex_init (&conf->conf_lock, NULL); - this->private = conf; - ret = 0; + conf->page_size = this->ctx->page_size; + + GF_OPTION_INIT("page-size", conf->page_size, size_uint64, out); + + GF_OPTION_INIT("page-count", conf->page_count, uint32, out); + + GF_OPTION_INIT("force-atime-update", conf->force_atime_update, bool, out); + + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); + + conf->files.next = &conf->files; + conf->files.prev = &conf->files; + + pthread_mutex_init(&conf->conf_lock, NULL); + + this->local_pool = mem_pool_new(ra_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = conf; + ret = 0; out: - if (ret == -1) { - if (conf != NULL) { - GF_FREE (conf); - } - } + if (ret == -1) { + GF_FREE(conf); + } - return ret; + return ret; } - void -fini (xlator_t *this) +fini(xlator_t *this) { - ra_conf_t *conf = NULL; + ra_conf_t *conf = NULL; - GF_VALIDATE_OR_GOTO ("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this, out); - conf = this->private; - if (conf == NULL) { - goto out; - } + conf = this->private; + if (conf == NULL) { + goto out; + } + + this->private = NULL; - pthread_mutex_destroy (&conf->conf_lock); - GF_FREE (conf); + /* The files structures allocated in open and create are not deleted. + * until that is freed, marking the below assert as warning. 
+ GF_ASSERT ((conf->files.next == &conf->files) + && (conf->files.prev == &conf->files)); + */ + if (!((conf->files.next == &conf->files) && + (conf->files.prev == &conf->files))) { + gf_msg(this->name, GF_LOG_INFO, 0, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + "undestroyed read ahead file structures found"); + } - this->private = NULL; + pthread_mutex_destroy(&conf->conf_lock); + GF_FREE(conf); out: - return; + return; } struct xlator_fops fops = { - .open = ra_open, - .create = ra_create, - .readv = ra_readv, - .writev = ra_writev, - .flush = ra_flush, - .fsync = ra_fsync, - .truncate = ra_truncate, - .ftruncate = ra_ftruncate, - .fstat = ra_fstat, + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + .writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .discard = ra_discard, + .zerofill = ra_zerofill, }; struct xlator_cbks cbks = { - .release = ra_release, + .release = ra_release, }; struct xlator_dumpops dumpops = { - .priv = ra_priv_dump, - .fdctx = ra_fdctx_dump, + .priv = ra_priv_dump, + .fdctx = ra_fdctx_dump, }; struct volume_options options[] = { - { .key = {"force-atime-update"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"page-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 16 - }, - { .key = {NULL} }, + { + .key = {"read-ahead"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable read-ahead", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {1}, + .tags = {"read-ahead"}, + .default_value = "false"}, + {.key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16, + .default_value = "4", + .op_version = {1}, + .tags = {"read-ahead"}, + .description = "Number of pages that will be pre-fetched"}, + {.key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 1048576 * 
64, + .default_value = "131072", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Page size with which read-ahead performs server I/O"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Enable/Disable read ahead translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "read-ahead", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h index d0bbcde810f..e9432fb47cc 100644 --- a/xlators/performance/read-ahead/src/read-ahead.h +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -1,36 +1,21 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __READ_AHEAD_H #define __READ_AHEAD_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/common-utils.h> #include "read-ahead-mem-types.h" struct ra_conf; @@ -39,82 +24,77 @@ struct ra_page; struct ra_file; struct ra_waitq; - struct ra_waitq { - struct ra_waitq *next; - void *data; + struct ra_waitq *next; + void *data; }; - struct ra_fill { - struct ra_fill *next; - struct ra_fill *prev; - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; - struct ra_local { - mode_t mode; - struct ra_fill fill; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - off_t pending_offset; - size_t pending_size; - fd_t *fd; - int32_t wait_count; - pthread_mutex_t local_lock; + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; }; - struct ra_page { - struct ra_page *next; - struct ra_page *prev; - struct ra_file *file; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ra_waitq *waitq; - struct iobref *iobref; + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; /* Internal request, not 
from user. */ + char poisoned; /* Pending read invalidated by write. */ + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; + struct iobref *iobref; + char stale; }; - struct ra_file { - struct ra_file *next; - struct ra_file *prev; - struct ra_conf *conf; - fd_t *fd; - int disabled; - size_t expected; - struct ra_page pages; - off_t offset; - size_t size; - int32_t refcount; - pthread_mutex_t file_lock; - struct iatt stbuf; - uint64_t page_size; - uint32_t page_count; + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct iatt stbuf; + uint64_t page_size; + uint32_t page_count; }; - struct ra_conf { - uint64_t page_size; - uint32_t page_count; - void *cache_block; - struct ra_file files; - gf_boolean_t force_atime_update; - pthread_mutex_t conf_lock; + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; }; - typedef struct ra_conf ra_conf_t; typedef struct ra_local ra_local_t; typedef struct ra_page ra_page_t; @@ -123,77 +103,69 @@ typedef struct ra_waitq ra_waitq_t; typedef struct ra_fill ra_fill_t; ra_page_t * -ra_page_get (ra_file_t *file, - off_t offset); +ra_page_get(ra_file_t *file, off_t offset); ra_page_t * -ra_page_create (ra_file_t *file, - off_t offset); +ra_page_create(ra_file_t *file, off_t offset); void -ra_page_fault (ra_file_t *file, - call_frame_t *frame, - off_t offset); +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset); void -ra_wait_on_page (ra_page_t *page, - call_frame_t *frame); +ra_wait_on_page(ra_page_t *page, call_frame_t *frame); ra_waitq_t * -ra_page_wakeup (ra_page_t *page); +ra_page_wakeup(ra_page_t *page); void -ra_page_flush (ra_page_t *page); +ra_page_flush(ra_page_t *page); 
ra_waitq_t * -ra_page_error (ra_page_t *page, - int32_t op_ret, - int32_t op_errno); +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno); void -ra_page_purge (ra_page_t *page); +ra_page_purge(ra_page_t *page); void -ra_frame_return (call_frame_t *frame); +ra_frame_return(call_frame_t *frame); void -ra_frame_fill (ra_page_t *page, - call_frame_t *frame); +ra_frame_fill(ra_page_t *page, call_frame_t *frame); void -ra_file_destroy (ra_file_t *file); +ra_file_destroy(ra_file_t *file); static inline void -ra_file_lock (ra_file_t *file) +ra_file_lock(ra_file_t *file) { - pthread_mutex_lock (&file->file_lock); + pthread_mutex_lock(&file->file_lock); } static inline void -ra_file_unlock (ra_file_t *file) +ra_file_unlock(ra_file_t *file) { - pthread_mutex_unlock (&file->file_lock); + pthread_mutex_unlock(&file->file_lock); } static inline void -ra_conf_lock (ra_conf_t *conf) +ra_conf_lock(ra_conf_t *conf) { - pthread_mutex_lock (&conf->conf_lock); + pthread_mutex_lock(&conf->conf_lock); } static inline void -ra_conf_unlock (ra_conf_t *conf) +ra_conf_unlock(ra_conf_t *conf) { - pthread_mutex_unlock (&conf->conf_lock); + pthread_mutex_unlock(&conf->conf_lock); } static inline void -ra_local_lock (ra_local_t *local) +ra_local_lock(ra_local_t *local) { - pthread_mutex_lock (&local->local_lock); + pthread_mutex_lock(&local->local_lock); } static inline void -ra_local_unlock (ra_local_t *local) +ra_local_unlock(ra_local_t *local) { - pthread_mutex_unlock (&local->local_lock); + pthread_mutex_unlock(&local->local_lock); } #endif /* __READ_AHEAD_H */ diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/performance/readdir-ahead/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am new file mode 100644 index 
00000000000..3d6b6ae951f --- /dev/null +++ b/xlators/performance/readdir-ahead/src/Makefile.am @@ -0,0 +1,18 @@ +xlator_LTLIBRARIES = readdir-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +readdir_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + +readdir_ahead_la_SOURCES = readdir-ahead.c +readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h \ + readdir-ahead-messages.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h new file mode 100644 index 00000000000..498ffae7f64 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RDA_MEM_TYPES_H__ +#define __RDA_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_rda_mem_types_ { + gf_rda_mt_rda_local = gf_common_mt_end + 1, + gf_rda_mt_rda_fd_ctx, + gf_rda_mt_rda_priv, + gf_rda_mt_inode_ctx_t, + gf_rda_mt_end +}; + +#endif diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h new file mode 100644 index 00000000000..28ec14dd845 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h @@ -0,0 +1,30 @@ +/*Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _READDIR_AHEAD_MESSAGES_H_ +#define _READDIR_AHEAD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(READDIR_AHEAD, READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + READDIR_AHEAD_MSG_VOL_MISCONFIGURED, READDIR_AHEAD_MSG_NO_MEMORY, + READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB, + READDIR_AHEAD_MSG_OUT_OF_SEQUENCE, READDIR_AHEAD_MSG_DICT_OP_FAILED); + +#endif /* _READDIR_AHEAD_MESSAGES_H_ */ diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c new file mode 100644 index 00000000000..4ba7ee7077a --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c @@ -0,0 +1,1382 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* + * performance/readdir-ahead preloads a local buffer with directory entries + * on opendir. The optimization involves using maximum sized gluster rpc + * requests (128k) to minimize overhead of smaller client requests. 
+ * + * For example, fuse currently supports a maximum readdir buffer of 4k + * (regardless of the filesystem client's buffer size). readdir-ahead should + * effectively convert these smaller requests into fewer, larger sized requests + * for simple, sequential workloads (i.e., ls). + * + * The translator is currently designed to handle the simple, sequential case + * only. If a non-sequential directory read occurs, readdir-ahead disables + * preloads on the directory. + */ + +#include <math.h> +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> +#include <glusterfs/call-stub.h> +#include "readdir-ahead.h" +#include "readdir-ahead-mem-types.h" +#include <glusterfs/defaults.h> +#include "readdir-ahead-messages.h" +static int +rda_fill_fd(call_frame_t *, xlator_t *, fd_t *); + +static void +rda_local_wipe(struct rda_local *local) +{ + if (local->fd) + fd_unref(local->fd); + if (local->xattrs) + dict_unref(local->xattrs); + if (local->inode) + inode_unref(local->inode); +} + +/* + * Get (or create) the fd context for storing prepopulated directory + * entries. 
+ */ +static struct rda_fd_ctx * +get_rda_fd_ctx(fd_t *fd, xlator_t *this) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + LOCK(&fd->lock); + + if (__fd_ctx_get(fd, this, &val) < 0) { + ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx), gf_rda_mt_rda_fd_ctx); + if (!ctx) + goto out; + + LOCK_INIT(&ctx->lock); + INIT_LIST_HEAD(&ctx->entries.list); + ctx->state = RDA_FD_NEW; + /* ctx offset values initialized to 0 */ + ctx->xattrs = NULL; + + if (__fd_ctx_set(fd, this, (uint64_t)(uintptr_t)ctx) < 0) { + GF_FREE(ctx); + ctx = NULL; + goto out; + } + } else { + ctx = (struct rda_fd_ctx *)(uintptr_t)val; + } +out: + UNLOCK(&fd->lock); + return ctx; +} + +static rda_inode_ctx_t * +__rda_inode_ctx_get(inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + rda_inode_ctx_t *ctx_p = NULL; + + ret = __inode_ctx_get1(inode, this, &ctx_uint); + if (ret == 0) + return (rda_inode_ctx_t *)(uintptr_t)ctx_uint; + + ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_rda_mt_inode_ctx_t); + if (!ctx_p) + return NULL; + + GF_ATOMIC_INIT(ctx_p->generation, 0); + + ctx_uint = (uint64_t)(uintptr_t)ctx_p; + ret = __inode_ctx_set1(inode, this, &ctx_uint); + if (ret < 0) { + GF_FREE(ctx_p); + return NULL; + } + + return ctx_p; +} + +static int +__rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this, + struct iatt *stbuf_in, struct iatt *stbuf_out, + uint64_t generation) +{ + rda_inode_ctx_t *ctx_p = NULL; + struct iatt tmp_stat = { + 0, + }; + + ctx_p = __rda_inode_ctx_get(inode, this); + if (!ctx_p) + return -1; + + if ((!stbuf_in) || (stbuf_in->ia_ctime == 0)) { + /* A fop modified a file but valid stbuf is not provided. + * Can't update iatt to reflect results of fop and hence + * invalidate the iatt stored in dentry. + * + * An example of this case can be response of write request + * that is cached in write-behind. 
+ */ + if (stbuf_in) + tmp_stat = *stbuf_in; + else + tmp_stat = ctx_p->statbuf; + memset(&ctx_p->statbuf, 0, sizeof(ctx_p->statbuf)); + gf_uuid_copy(ctx_p->statbuf.ia_gfid, tmp_stat.ia_gfid); + ctx_p->statbuf.ia_type = tmp_stat.ia_type; + GF_ATOMIC_INC(ctx_p->generation); + } else { + if (ctx_p->statbuf.ia_ctime) { + if (stbuf_in->ia_ctime < ctx_p->statbuf.ia_ctime) { + goto out; + } + + if ((stbuf_in->ia_ctime == ctx_p->statbuf.ia_ctime) && + (stbuf_in->ia_ctime_nsec < ctx_p->statbuf.ia_ctime_nsec)) { + goto out; + } + } else { + if ((generation != -1) && + (generation != GF_ATOMIC_GET(ctx_p->generation))) + goto out; + } + + ctx_p->statbuf = *stbuf_in; + } + +out: + if (stbuf_out) + *stbuf_out = ctx_p->statbuf; + + return 0; +} + +static int +rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this, + struct iatt *stbuf_in, struct iatt *stbuf_out, + uint64_t generation) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __rda_inode_ctx_update_iatts(inode, this, stbuf_in, stbuf_out, + generation); + } + UNLOCK(&inode->lock); + + return ret; +} + +/* + * Reset the tracking state of the context. 
+ */ +static void +rda_reset_ctx(xlator_t *this, struct rda_fd_ctx *ctx) +{ + struct rda_priv *priv = NULL; + + priv = this->private; + + ctx->state = RDA_FD_NEW; + ctx->cur_offset = 0; + ctx->next_offset = 0; + ctx->op_errno = 0; + + gf_dirent_free(&ctx->entries); + GF_ATOMIC_SUB(priv->rda_cache_size, ctx->cur_size); + ctx->cur_size = 0; + + if (ctx->xattrs) { + dict_unref(ctx->xattrs); + ctx->xattrs = NULL; + } +} + +static void +rda_mark_inode_dirty(xlator_t *this, inode_t *inode) +{ + inode_t *parent = NULL; + fd_t *fd = NULL; + uint64_t val = 0; + int32_t ret = 0; + struct rda_fd_ctx *fd_ctx = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + parent = inode_parent(inode, NULL, NULL); + if (parent) { + LOCK(&parent->lock); + { + list_for_each_entry(fd, &parent->fd_list, inode_list) + { + val = 0; + fd_ctx_get(fd, this, &val); + if (val == 0) + continue; + + fd_ctx = (void *)(uintptr_t)val; + uuid_utoa_r(inode->gfid, gfid); + if (!GF_ATOMIC_GET(fd_ctx->prefetching)) + continue; + + LOCK(&fd_ctx->lock); + { + if (GF_ATOMIC_GET(fd_ctx->prefetching)) { + if (fd_ctx->writes_during_prefetch == NULL) + fd_ctx->writes_during_prefetch = dict_new(); + + ret = dict_set_int8(fd_ctx->writes_during_prefetch, + gfid, 1); + if (ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "marking to invalidate stats of %s from an " + "in progress " + "prefetching has failed, might result in " + "stale stat to " + "application", + gfid); + } + } + } + UNLOCK(&fd_ctx->lock); + } + } + UNLOCK(&parent->lock); + inode_unref(parent); + } + + return; +} + +/* + * Check whether we can handle a request. Offset verification is done by the + * caller, so we only check whether the preload buffer has completion status + * (including an error) or has some data to return. 
+ */ +static gf_boolean_t +rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size) +{ + if ((ctx->state & RDA_FD_EOD) || (ctx->state & RDA_FD_ERROR) || + (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0)) || + (request_size && ctx->cur_size >= request_size)) + return _gf_true; + + return _gf_false; +} + +void +rda_inode_ctx_get_iatt(inode_t *inode, xlator_t *this, struct iatt *attr) +{ + rda_inode_ctx_t *ctx_p = NULL; + + if (!inode || !this || !attr) + goto out; + + LOCK(&inode->lock); + { + ctx_p = __rda_inode_ctx_get(inode, this); + if (ctx_p) { + *attr = ctx_p->statbuf; + } + } + UNLOCK(&inode->lock); + +out: + return; +} + +/* + * Serve a request from the fd dentry list based on the size of the request + * buffer. ctx must be locked. + */ +static int32_t +__rda_fill_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size, + struct rda_fd_ctx *ctx) +{ + gf_dirent_t *dirent, *tmp; + size_t dirent_size, size = 0; + int32_t count = 0; + struct rda_priv *priv = NULL; + struct iatt tmp_stat = { + 0, + }; + + priv = this->private; + + list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) + { + dirent_size = gf_dirent_size(dirent->d_name); + if (size + dirent_size > request_size) + break; + + memset(&tmp_stat, 0, sizeof(tmp_stat)); + + if (dirent->inode && (!((strcmp(dirent->d_name, ".") == 0) || + (strcmp(dirent->d_name, "..") == 0)))) { + rda_inode_ctx_get_iatt(dirent->inode, this, &tmp_stat); + dirent->d_stat = tmp_stat; + } + + size += dirent_size; + list_del_init(&dirent->list); + ctx->cur_size -= dirent_size; + + GF_ATOMIC_SUB(priv->rda_cache_size, dirent_size); + + list_add_tail(&dirent->list, &entries->list); + ctx->cur_offset = dirent->d_off; + count++; + } + + if (ctx->cur_size <= priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + + return count; +} + +static int32_t +__rda_serve_readdirp(xlator_t *this, struct rda_fd_ctx *ctx, size_t size, + gf_dirent_t *entries, int *op_errno) +{ + int32_t ret = 0; + + ret = 
__rda_fill_readdirp(this, entries, size, ctx); + + if (!ret && (ctx->state & RDA_FD_ERROR)) { + ret = -1; + ctx->state &= ~RDA_FD_ERROR; + + /* + * the preload has stopped running in the event of an error, so + * pass all future requests along + */ + ctx->state |= RDA_FD_BYPASS; + } + /* + * Use the op_errno sent by lower layers as xlators above will check + * the op_errno for identifying whether readdir is completed or not. + */ + *op_errno = ctx->op_errno; + + return ret; +} + +static int32_t +rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + struct rda_fd_ctx *ctx = NULL; + int fill = 0; + gf_dirent_t entries; + int ret = 0; + int op_errno = 0; + gf_boolean_t serve = _gf_false; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + if (ctx->state & RDA_FD_BYPASS) + goto bypass; + + INIT_LIST_HEAD(&entries.list); + LOCK(&ctx->lock); + + /* recheck now that we have the lock */ + if (ctx->state & RDA_FD_BYPASS) { + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If a new read comes in at offset 0 and the buffer has been + * completed, reset the context and kickstart the filler again. + */ + if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) { + rda_reset_ctx(this, ctx); + /* + * Unref and discard the 'list of xattrs to be fetched' + * stored during opendir call. This is done above - inside + * rda_reset_ctx(). + * Now, ref the xdata passed by md-cache in actual readdirp() + * call and use that for all subsequent internal readdirp() + * requests issued by this xlator. + */ + ctx->xattrs = dict_ref(xdata); + fill = 1; + } + + /* + * If a readdir occurs at an unexpected offset or we already have a + * request pending, admit defeat and just get out of the way. 
+ */ + if (off != ctx->cur_offset || ctx->stub) { + ctx->state |= RDA_FD_BYPASS; + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If we haven't bypassed the preload, this means we can either serve + * the request out of the preload or the request that enables us to do + * so is in flight... + */ + if (rda_can_serve_readdirp(ctx, size)) { + ret = __rda_serve_readdirp(this, ctx, size, &entries, &op_errno); + serve = _gf_true; + + if (op_errno == ENOENT && + !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0))) + op_errno = 0; + } else { + ctx->stub = fop_readdirp_stub(frame, NULL, fd, size, off, xdata); + if (!ctx->stub) { + UNLOCK(&ctx->lock); + goto err; + } + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 1; + if (!ctx->xattrs) + ctx->xattrs = dict_ref(xdata); + ctx->state |= RDA_FD_RUNNING; + } + } + + UNLOCK(&ctx->lock); + + if (serve) { + STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + } + + if (fill) + rda_fill_fd(frame, this, fd); + + return 0; + +bypass: + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *dirent = NULL; + gf_dirent_t *tmp = NULL; + gf_dirent_t serve_entries; + struct rda_local *local = frame->local; + struct rda_fd_ctx *ctx = local->ctx; + struct rda_priv *priv = this->private; + int fill = 1; + size_t dirent_size = 0; + int ret = 0; + gf_boolean_t serve = _gf_false; + call_stub_t *stub = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + uint64_t generation = 0; + call_frame_t *fill_frame = NULL; + + INIT_LIST_HEAD(&serve_entries.list); + LOCK(&ctx->lock); + + /* Verify that the preload buffer is still pending on this data. 
*/ + if (ctx->next_offset != local->offset) { + gf_msg(this->name, GF_LOG_ERROR, 0, READDIR_AHEAD_MSG_OUT_OF_SEQUENCE, + "Out of sequence directory preload."); + ctx->state |= (RDA_FD_BYPASS | RDA_FD_ERROR); + ctx->op_errno = EUCLEAN; + + goto out; + } + + if (entries) { + list_for_each_entry_safe(dirent, tmp, &entries->list, list) + { + list_del_init(&dirent->list); + + /* must preserve entry order */ + list_add_tail(&dirent->list, &ctx->entries.list); + if (dirent->inode) { + /* If ctxp->stat is invalidated, don't update it + * with dirent->d_stat as we don't have + * generation number of the inode when readdirp + * request was initiated. So, we pass 0 for + * generation number + */ + + generation = -1; + if (ctx->writes_during_prefetch) { + memset(gfid, 0, sizeof(gfid)); + uuid_utoa_r(dirent->inode->gfid, gfid); + if (dict_get(ctx->writes_during_prefetch, gfid)) + generation = 0; + } + + if (!((strcmp(dirent->d_name, ".") == 0) || + (strcmp(dirent->d_name, "..") == 0))) { + rda_inode_ctx_update_iatts(dirent->inode, this, + &dirent->d_stat, &dirent->d_stat, + generation); + } + } + + dirent_size = gf_dirent_size(dirent->d_name); + + ctx->cur_size += dirent_size; + + GF_ATOMIC_ADD(priv->rda_cache_size, dirent_size); + + ctx->next_offset = dirent->d_off; + } + } + + if (ctx->writes_during_prefetch) { + dict_unref(ctx->writes_during_prefetch); + ctx->writes_during_prefetch = NULL; + } + + GF_ATOMIC_DEC(ctx->prefetching); + + if (ctx->cur_size >= priv->rda_high_wmark) + ctx->state &= ~RDA_FD_PLUGGED; + + if (!op_ret || op_errno == ENOENT) { + /* we've hit eod */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_EOD; + ctx->op_errno = op_errno; + } else if (op_ret == -1) { + /* kill the preload and pend the error */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_ERROR; + ctx->op_errno = op_errno; + } + + /* + * NOTE: The strict bypass logic in readdirp() means a pending request + * is always based on ctx->cur_offset. 
+ */ + if (ctx->stub && rda_can_serve_readdirp(ctx, ctx->stub->args.size)) { + ret = __rda_serve_readdirp(this, ctx, ctx->stub->args.size, + &serve_entries, &op_errno); + serve = _gf_true; + stub = ctx->stub; + ctx->stub = NULL; + } + +out: + /* + * If we have been marked for bypass and have no pending stub, clear the + * run state so we stop preloading the context with entries. + */ + if (!ctx->stub && + ((ctx->state & RDA_FD_BYPASS) || + GF_ATOMIC_GET(priv->rda_cache_size) > priv->rda_cache_limit)) + ctx->state &= ~RDA_FD_RUNNING; + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 0; + if (ctx->xattrs) { + /* + * fill = 0 and hence rda_fill_fd() won't be invoked. + * unref for ref taken in rda_fill_fd() + */ + dict_unref(ctx->xattrs); + ctx->xattrs = NULL; + } + + fill_frame = ctx->fill_frame; + ctx->fill_frame = NULL; + } + + if (op_errno == ENOENT && + !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0))) + op_errno = 0; + + UNLOCK(&ctx->lock); + if (fill_frame) { + rda_local_wipe(fill_frame->local); + STACK_DESTROY(fill_frame->root); + } + + if (serve) { + STACK_UNWIND_STRICT(readdirp, stub->frame, ret, op_errno, + &serve_entries, xdata); + gf_dirent_free(&serve_entries); + call_stub_destroy(stub); + } + + if (fill) + rda_fill_fd(frame, this, local->fd); + + return 0; +} + +/* + * Start prepopulating the fd context with directory entries. 
+ */ +static int +rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + call_frame_t *nframe = NULL; + struct rda_local *local = NULL; + struct rda_local *orig_local = frame->local; + struct rda_fd_ctx *ctx; + off_t offset; + struct rda_priv *priv = this->private; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + LOCK(&ctx->lock); + + if (ctx->state & RDA_FD_NEW) { + ctx->state &= ~RDA_FD_NEW; + ctx->state |= RDA_FD_RUNNING; + if (priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + } + + offset = ctx->next_offset; + + if (!ctx->fill_frame) { + nframe = copy_frame(frame); + if (!nframe) { + UNLOCK(&ctx->lock); + goto err; + } + + local = mem_get0(this->local_pool); + if (!local) { + UNLOCK(&ctx->lock); + goto err; + } + + local->ctx = ctx; + local->fd = fd_ref(fd); + nframe->local = local; + + ctx->fill_frame = nframe; + + if (!ctx->xattrs && orig_local && orig_local->xattrs) { + /* when this function is invoked by rda_opendir_cbk */ + ctx->xattrs = dict_ref(orig_local->xattrs); + } + } else { + nframe = ctx->fill_frame; + local = nframe->local; + } + + local->offset = offset; + GF_ATOMIC_INC(ctx->prefetching); + + UNLOCK(&ctx->lock); + + STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size, + offset, ctx->xattrs); + + return 0; + +err: + if (nframe) { + rda_local_wipe(nframe->local); + FRAME_DESTROY(nframe); + } + + return -1; +} + +static int32_t +rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + if (!op_ret) + rda_fill_fd(frame, this, fd); + + RDA_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + int op_errno = 0; + struct rda_local *local = NULL; + + if (xdata) { + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } + + 
/* + * Retrieve list of keys set by md-cache xlator and store it + * in local to be consumed in rda_opendir_cbk + */ + local->xattrs = dict_copy_with_ref(xdata, NULL); + frame->local = local; + } + + STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(opendir, frame, -1, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + + rda_mark_inode_dirty(this, local->inode); + + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(writev, frame, this, fd->inode, xdata, fd, + vector, count, off, flags, iobref); + return 0; +} + +static int32_t +rda_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t 
*xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fallocate, frame, this, fd->inode, xdata, fd, + keep_size, offset, len); + return 0; +} + +static int32_t +rda_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(zerofill, frame, this, fd->inode, xdata, fd, + offset, len); + return 0; +} + +static int32_t +rda_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(discard, frame, this, fd->inode, xdata, fd, + offset, len); + return 0; +} + +static int32_t +rda_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if 
(op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(ftruncate, frame, this, fd->inode, xdata, fd, + offset); + return 0; +} + +static int32_t +rda_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, postbuf, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(truncate, frame, this, loc->inode, xdata, loc, + offset); + return 0; +} + +static int32_t +rda_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(setxattr, frame, this, loc->inode, xdata, loc, + dict, flags); + return 0; +} + 
+static int32_t +rda_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fsetxattr, frame, this, fd->inode, xdata, fd, + dict, flags); + return 0; +} + +static int32_t +rda_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, statpost, &postbuf_out, + local->generation); + +unwind: + RDA_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(setattr, frame, this, loc->inode, xdata, loc, + stbuf, valid); + return 0; +} + +static int32_t +rda_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + struct rda_local *local = NULL; + struct iatt postbuf_out = { + 0, + }; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, statpost, &postbuf_out, + local->generation); + +unwind: + 
RDA_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, &postbuf_out, + xdata); + return 0; +} + +static int32_t +rda_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fsetattr, frame, this, fd->inode, xdata, fd, + stbuf, valid); + return 0; +} + +static int32_t +rda_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(removexattr, frame, this, loc->inode, xdata, + loc, name); + return 0; +} + +static int32_t +rda_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + struct rda_local *local = NULL; + + if (op_ret < 0) + goto unwind; + + local = frame->local; + rda_mark_inode_dirty(this, local->inode); + rda_inode_ctx_update_iatts(local->inode, this, NULL, NULL, + local->generation); +unwind: + RDA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int32_t +rda_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + RDA_COMMON_MODIFICATION_FOP(fremovexattr, frame, this, fd->inode, xdata, fd, + name); + return 0; +} + +static int32_t +rda_releasedir(xlator_t *this, fd_t *fd) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + if (fd_ctx_del(fd, this, &val) < 0) + return -1; + + ctx = (struct rda_fd_ctx *)(uintptr_t)val; + if (!ctx) + return 0; + + rda_reset_ctx(this, ctx); + + if (ctx->fill_frame) + 
STACK_DESTROY(ctx->fill_frame->root); + + if (ctx->stub) + gf_msg(this->name, GF_LOG_ERROR, 0, + READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB, + "released a directory with a pending stub"); + + GF_FREE(ctx); + return 0; +} + +static int +rda_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_uint = 0; + rda_inode_ctx_t *ctx = NULL; + + inode_ctx_del1(inode, this, &ctx_uint); + if (!ctx_uint) + return 0; + + ctx = (rda_inode_ctx_t *)(uintptr_t)ctx_uint; + + GF_FREE(ctx); + + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + goto out; + + ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1); + + if (ret != 0) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READDIR_AHEAD_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + struct rda_priv *priv = this->private; + + GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options, + size_uint64, err); + GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size_uint64, + err); + GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, + size_uint64, err); + GF_OPTION_RECONF("rda-cache-limit", priv->rda_cache_limit, options, + size_uint64, err); + GF_OPTION_RECONF("parallel-readdir", priv->parallel_readdir, options, bool, + err); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, err); + + return 0; +err: + return -1; +} + +int +init(xlator_t *this) +{ + struct rda_priv *priv = NULL; + + GF_VALIDATE_OR_GOTO("readdir-ahead", this, err); + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: readdir-ahead not configured with exactly one" + " child"); + goto err; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, + READDIR_AHEAD_MSG_VOL_MISCONFIGURED, + "dangling volume. 
check volfile "); + } + + priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv); + if (!priv) + goto err; + this->private = priv; + + GF_ATOMIC_INIT(priv->rda_cache_size, 0); + + this->local_pool = mem_pool_new(struct rda_local, 32); + if (!this->local_pool) + goto err; + + GF_OPTION_INIT("rda-request-size", priv->rda_req_size, size_uint64, err); + GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size_uint64, err); + GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size_uint64, err); + GF_OPTION_INIT("rda-cache-limit", priv->rda_cache_limit, size_uint64, err); + GF_OPTION_INIT("parallel-readdir", priv->parallel_readdir, bool, err); + GF_OPTION_INIT("pass-through", this->pass_through, bool, err); + + return 0; + +err: + if (this->local_pool) + mem_pool_destroy(this->local_pool); + if (priv) + GF_FREE(priv); + + return -1; +} + +void +fini(xlator_t *this) +{ + GF_VALIDATE_OR_GOTO("readdir-ahead", this, out); + + GF_FREE(this->private); + +out: + return; +} + +struct xlator_fops fops = { + .opendir = rda_opendir, + .readdirp = rda_readdirp, + /* inode write */ + /* TODO: invalidate a dentry's stats if its pointing to a directory + * when entry operations happen in that directory + */ + .writev = rda_writev, + .truncate = rda_truncate, + .ftruncate = rda_ftruncate, + .fallocate = rda_fallocate, + .discard = rda_discard, + .zerofill = rda_zerofill, + /* metadata write */ + .setxattr = rda_setxattr, + .fsetxattr = rda_fsetxattr, + .setattr = rda_setattr, + .fsetattr = rda_fsetattr, + .removexattr = rda_removexattr, + .fremovexattr = rda_fremovexattr, +}; + +struct xlator_cbks cbks = { + .releasedir = rda_releasedir, + .forget = rda_forget, +}; + +struct volume_options options[] = { + { + .key = {"readdir-ahead"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable readdir-ahead", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"rda-request-size"}, + .type = 
GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 131072, + .default_value = "131072", + .description = "size of buffer in readdirp calls initiated by " + "readdir-ahead ", + }, + { + .key = {"rda-low-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 10 * GF_UNIT_MB, + .default_value = "4096", + .description = "the value under which readdir-ahead plugs", + }, + { + .key = {"rda-high-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 100 * GF_UNIT_MB, + .default_value = "128KB", + .description = "the value over which readdir-ahead unplugs", + }, + { + .key = {"rda-cache-limit"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = INFINITY, + .default_value = "10MB", + .description = "maximum size of cache consumed by readdir-ahead " + "xlator. This value is global and total memory " + "consumption by readdir-ahead is capped by this " + "value, irrespective of the number/size of " + "directories cached", + }, + {.key = {"parallel-readdir"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {GD_OP_VERSION_3_10_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .default_value = "off", + .description = "If this option is enabled, the readdir operation " + "is performed in parallel on all the bricks, thus " + "improving the performance of readdir. 
Note that " + "the performance improvement is higher in large " + "clusters"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"readdir-ahead"}, + .description = "Enable/Disable readdir ahead translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "readdir-ahead", + .category = GF_MAINTAINED, +}; diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h new file mode 100644 index 00000000000..619c41059ff --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h @@ -0,0 +1,98 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __READDIR_AHEAD_H +#define __READDIR_AHEAD_H + +/* state flags */ +#define RDA_FD_NEW (1 << 0) +#define RDA_FD_RUNNING (1 << 1) +#define RDA_FD_EOD (1 << 2) +#define RDA_FD_ERROR (1 << 3) +#define RDA_FD_BYPASS (1 << 4) +#define RDA_FD_PLUGGED (1 << 5) + +#define RDA_COMMON_MODIFICATION_FOP(name, frame, this, __inode, __xdata, \ + args...) 
\ + do { \ + struct rda_local *__local = NULL; \ + rda_inode_ctx_t *ctx_p = NULL; \ + \ + __local = mem_get0(this->local_pool); \ + __local->inode = inode_ref(__inode); \ + LOCK(&__inode->lock); \ + { \ + ctx_p = __rda_inode_ctx_get(__inode, this); \ + } \ + UNLOCK(&__inode->lock); \ + __local->generation = GF_ATOMIC_GET(ctx_p->generation); \ + \ + frame->local = __local; \ + if (__xdata) \ + __local->xattrs = dict_ref(__xdata); \ + \ + STACK_WIND(frame, rda_##name##_cbk, FIRST_CHILD(this), \ + FIRST_CHILD(this)->fops->name, args, __xdata); \ + } while (0) + +#define RDA_STACK_UNWIND(fop, frame, params...) \ + do { \ + struct rda_local *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local) { \ + rda_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0) + +struct rda_fd_ctx { + off_t cur_offset; /* current head of the ctx */ + size_t cur_size; /* current size of the preload */ + off_t next_offset; /* tail of the ctx */ + uint32_t state; + gf_lock_t lock; + gf_dirent_t entries; + call_frame_t *fill_frame; + call_stub_t *stub; + int op_errno; + dict_t *xattrs; /* md-cache keys to be sent in readdirp() */ + dict_t *writes_during_prefetch; + gf_atomic_t prefetching; +}; + +struct rda_local { + struct rda_fd_ctx *ctx; + fd_t *fd; + dict_t *xattrs; /* md-cache keys to be sent in readdirp() */ + inode_t *inode; + off_t offset; + uint64_t generation; + int32_t skip_dir; +}; + +struct rda_priv { + uint64_t rda_req_size; + uint64_t rda_low_wmark; + uint64_t rda_high_wmark; + uint64_t rda_cache_limit; + gf_atomic_t rda_cache_size; + gf_boolean_t parallel_readdir; +}; + +typedef struct rda_inode_ctx { + struct iatt statbuf; + gf_atomic_t generation; +} rda_inode_ctx_t; + +#endif /* __READDIR_AHEAD_H */ diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am deleted file mode 100644 index 
cfb13071486..00000000000 --- a/xlators/performance/stat-prefetch/src/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -xlator_LTLIBRARIES = stat-prefetch.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance - -stat_prefetch_la_LDFLAGS = -module -avoidversion -stat_prefetch_la_SOURCES = stat-prefetch.c -noinst_HEADERS = stat-prefetch.h stat-prefetch-mem-types.h - -stat_prefetch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -I$(CONTRIBDIR)/rbtree -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = - diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch-mem-types.h b/xlators/performance/stat-prefetch/src/stat-prefetch-mem-types.h deleted file mode 100644 index 156b3472bc6..00000000000 --- a/xlators/performance/stat-prefetch/src/stat-prefetch-mem-types.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - - -#ifndef __SP_MEM_TYPES_H__ -#define __SP_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_sp_mem_types_ { - gf_sp_mt_sp_cache_t = gf_common_mt_end + 1, - gf_sp_mt_sp_fd_ctx_t, - gf_sp_mt_stat, - gf_sp_mt_sp_local_t, - gf_sp_mt_sp_inode_ctx_t, - gf_sp_mt_sp_private_t, - gf_sp_mt_fd_wrapper_t, - gf_sp_mt_end -}; -#endif diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c b/xlators/performance/stat-prefetch/src/stat-prefetch.c deleted file mode 100644 index 73cc3a955d8..00000000000 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.c +++ /dev/null @@ -1,4257 +0,0 @@ -/* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - -#include "stat-prefetch.h" -#include "statedump.h" -#include "fd.h" - -#define GF_SP_CACHE_BUCKETS 1 -#define GF_SP_CACHE_ENTRIES_EXPECTED (128 * 1024) //1048576 - -typedef enum { - SP_EXPECT, - SP_DONT_EXPECT, - SP_DONT_CARE -} sp_expect_t; - - -void -sp_inode_ctx_free (xlator_t *this, sp_inode_ctx_t *ctx) -{ - call_stub_t *stub = NULL, *tmp = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, ctx, out); - - LOCK (&ctx->lock); - { - if (!list_empty (&ctx->waiting_ops)) { - gf_log (this->name, GF_LOG_WARNING, "inode ctx is " - "being freed even when there are file " - "operations waiting for lookup-behind to " - "complete. The operations in the waiting list " - "are:"); - list_for_each_entry_safe (stub, tmp, &ctx->waiting_ops, - list) { - gf_log (this->name, GF_LOG_WARNING, - "OP (%s)", gf_fop_list[stub->fop]); - - list_del_init (&stub->list); - call_stub_destroy (stub); - } - } - } - UNLOCK (&ctx->lock); - - LOCK_DESTROY (&ctx->lock); - GF_FREE (ctx); - -out: - return; -} - - -sp_inode_ctx_t * -sp_inode_ctx_init () -{ - sp_inode_ctx_t *inode_ctx = NULL; - - inode_ctx = GF_CALLOC (1, sizeof (*inode_ctx), gf_sp_mt_sp_inode_ctx_t); - if (inode_ctx == NULL) { - goto out; - } - - LOCK_INIT (&inode_ctx->lock); - INIT_LIST_HEAD (&inode_ctx->waiting_ops); - -out: - return inode_ctx; -} - - -int -sp_update_inode_ctx (xlator_t *this, inode_t *inode, int32_t *op_ret, - int32_t *op_errno, char *lookup_in_progress, - char *looked_up, struct iatt *stbuf, - struct list_head *waiting_ops, int32_t *error) -{ - int32_t ret = -1; - sp_inode_ctx_t *inode_ctx = NULL; - uint64_t value = 0; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - ret = inode_ctx_get (inode, this, &value); - if (ret == 0) { - inode_ctx = (sp_inode_ctx_t *)(long)value; - } - - if (inode_ctx == NULL) { - ret = -1; - if (error != NULL) { - *error = EINVAL; - } - - gf_log (this->name, GF_LOG_WARNING, - 
"stat-prefetch context not set in inode " - "(gfid:%s)", uuid_utoa (inode->gfid)); - goto out; - } - - LOCK (&inode_ctx->lock); - { - if (op_ret != NULL) { - inode_ctx->op_ret = *op_ret; - } - - if (op_errno != NULL) { - inode_ctx->op_errno = *op_errno; - } - - if (looked_up != NULL) { - inode_ctx->looked_up = *looked_up; - } - - if (lookup_in_progress != NULL) { - inode_ctx->lookup_in_progress = *lookup_in_progress; - } - - if ((op_ret != NULL ) && (*op_ret == 0) && (stbuf != NULL) - && IA_ISDIR (stbuf->ia_type)) { - memcpy (&inode_ctx->stbuf, stbuf, sizeof (*stbuf)); - } - - if (waiting_ops != NULL) { - list_splice_init (&inode_ctx->waiting_ops, waiting_ops); - } - } - UNLOCK (&inode_ctx->lock); - - ret = 0; - -out: - return ret; -} - - -sp_inode_ctx_t * -sp_check_and_create_inode_ctx (xlator_t *this, inode_t *inode, - sp_expect_t expect) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &value); - if (ret == 0) { - inode_ctx = (sp_inode_ctx_t *)(long)value; - - if ((expect == SP_DONT_EXPECT) && (inode_ctx != NULL)) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "stat-prefetch context is " - "present in inode " - "(gfid:%s) " - "when it is supposed to be " - "not present", - uuid_utoa (inode->gfid)); - } - } else { - if (expect == SP_EXPECT) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "stat-prefetch context is " - "not present in inode " - "(gfid:%s)" - " when it is supposed to be " - "present", - uuid_utoa (inode->gfid)); - } - - inode_ctx = sp_inode_ctx_init (); - if (inode_ctx != NULL) { - ret = __inode_ctx_put (inode, this, - (long)inode_ctx); - if (ret == -1) { - sp_inode_ctx_free (this, inode_ctx); - inode_ctx = NULL; - } - } - } - } - UNLOCK (&inode->lock); - -out: - return inode_ctx; -} - - -sp_cache_t * -sp_cache_ref (sp_cache_t *cache) -{ - if 
(cache == NULL) { - goto out; - } - - LOCK (&cache->lock); - { - cache->ref++; - } - UNLOCK (&cache->lock); - -out: - return cache;; -} - - -void -sp_cache_unref (sp_cache_t *cache) -{ - int refcount = 0; - - if (cache == NULL) { - goto out; - } - - LOCK (&cache->lock); - { - refcount = --cache->ref; - } - UNLOCK (&cache->lock); - - if (refcount == 0) { - rbthash_table_destroy (cache->table); - GF_FREE (cache); - } - -out: - return; -} - - -int32_t -sp_process_inode_ctx (call_frame_t *frame, xlator_t *this, loc_t *loc, - call_stub_t *stub, char *need_unwind, char *need_lookup, - char *can_wind, int32_t *error) -{ - int32_t ret = -1, op_errno = EINVAL; - sp_local_t *local = NULL; - sp_inode_ctx_t *inode_ctx = NULL; - uint64_t value = 0; - - if (need_unwind != NULL) { - *need_unwind = 1; - } - - GF_VALIDATE_OR_GOTO ("stat-prefetch", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, this, out); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, out); - GF_VALIDATE_OR_GOTO (frame->this->name, loc->inode, out); - GF_VALIDATE_OR_GOTO (frame->this->name, need_unwind, out); - GF_VALIDATE_OR_GOTO (frame->this->name, need_lookup, out); - GF_VALIDATE_OR_GOTO (frame->this->name, can_wind, out); - - inode_ctx_get (loc->inode, this, &value); - - inode_ctx = (sp_inode_ctx_t *)(long) value; - if (inode_ctx == NULL) { - gf_log_callingfn (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode " - "(gfid:%s)", uuid_utoa (loc->inode->gfid)); - *can_wind = 1; - *need_unwind = 0; - op_errno = 0; - ret = 0; - goto out; - } - - LOCK (&inode_ctx->lock); - { - if (!(inode_ctx->looked_up || inode_ctx->lookup_in_progress)) { - if (frame->local == NULL) { - local = GF_CALLOC (1, sizeof (*local), - gf_sp_mt_sp_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unlock; - } - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, - "loc_copy failed (%s)", - strerror (op_errno)); - goto 
unlock; - } - } - - *need_lookup = 1; - inode_ctx->lookup_in_progress = 1; - } - - if (inode_ctx->looked_up) { - *can_wind = 1; - } else { - list_add_tail (&stub->list, &inode_ctx->waiting_ops); - stub = NULL; - } - - *need_unwind = 0; - ret = 0; - } -unlock: - UNLOCK (&inode_ctx->lock); - -out: - if (stub != NULL) { - call_stub_destroy (stub); - } - - if (error != NULL) { - *error = op_errno; - } - - return ret; -} - - -inline uint32_t -sp_hashfn (void *data, int len) -{ - return gf_dm_hashfn ((const char *)data, len); -} - - -sp_cache_t * -sp_cache_init (xlator_t *this) -{ - sp_cache_t *cache = NULL; - sp_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - GF_VALIDATE_OR_GOTO (this->name, priv->mem_pool, out); - - cache = GF_CALLOC (1, sizeof (*cache), gf_sp_mt_sp_cache_t); - if (cache) { - cache->table = - rbthash_table_init (GF_SP_CACHE_BUCKETS, - sp_hashfn, __gf_free, - 0, priv->mem_pool); - if (cache->table == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "cannot init a new rbthash table to hold " - "cache"); - GF_FREE (cache); - cache = NULL; - goto out; - } - - LOCK_INIT (&cache->lock); - cache->this = this; - } - -out: - return cache; -} - - -void -sp_local_free (sp_local_t *local) -{ - if (local) { - loc_wipe (&local->loc); - GF_FREE (local); - } -} - - -int32_t -sp_cache_remove_entry (sp_cache_t *cache, char *name, char remove_all) -{ - int32_t ret = -1; - rbthash_table_t *table = NULL; - xlator_t *this = NULL; - sp_private_t *priv = NULL; - void *data = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", cache, out); - if ((name == NULL) && !remove_all) { - gf_log ((cache->this ? 
cache->this->name : "stat-prefetch"), - GF_LOG_WARNING, - "request to remove a single entry from cache and is no " - "name passed to identify it"); - goto out; - } - - this = cache->this; - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); - - priv = this->private; - - LOCK (&cache->lock); - { - if (remove_all) { - table = cache->table; - cache->table = rbthash_table_init (GF_SP_CACHE_BUCKETS, - sp_hashfn, __gf_free, - 0, priv->mem_pool); - if (cache->table == NULL) { - cache->table = table; - } else { - rbthash_table_destroy (table); - ret = 0; - if (priv) { - LOCK (&priv->lock); - { - priv->entries = 0; - } - UNLOCK (&priv->lock); - } - } - } else { - data = rbthash_remove (cache->table, name, - strlen (name)); - GF_FREE (data); - ret = 0; - if (priv) { - LOCK (&priv->lock); - { - priv->entries--; - } - UNLOCK (&priv->lock); - } - } - } - UNLOCK (&cache->lock); - -out: - return ret; -} - - -int32_t -sp_cache_get_entry (sp_cache_t *cache, char *name, gf_dirent_t **entry) -{ - int32_t ret = -1; - gf_dirent_t *tmp = NULL, *new = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", cache, out); - GF_VALIDATE_OR_GOTO ("stat-prefetch", cache->this, out); - GF_VALIDATE_OR_GOTO (cache->this->name, name, out); - GF_VALIDATE_OR_GOTO (cache->this->name, entry, out); - - LOCK (&cache->lock); - { - tmp = rbthash_get (cache->table, name, strlen (name)); - if (tmp != NULL) { - new = gf_dirent_for_name (tmp->d_name); - if (new == NULL) { - gf_log (cache->this->name, GF_LOG_WARNING, - "cannot create a new dentry to copy " - "from cache"); - goto unlock; - } - - new->d_ino = tmp->d_ino; - new->d_off = tmp->d_off; - new->d_len = tmp->d_len; - new->d_type = tmp->d_type; - new->d_stat = tmp->d_stat; - - *entry = new; - ret = 0; - } - } -unlock: - UNLOCK (&cache->lock); - -out: - return ret; -} - - -void -sp_cache_free (sp_cache_t *cache) -{ - sp_cache_remove_entry (cache, NULL, 1); - sp_cache_unref (cache); -} - - -sp_cache_t * 
-__sp_get_cache_fd (xlator_t *this, fd_t *fd) -{ - int32_t ret = -1; - sp_cache_t *cache = NULL; - uint64_t value = 0; - sp_fd_ctx_t *fd_ctx = NULL; - - ret = __fd_ctx_get (fd, this, &value); - if (ret == -1) { - goto out; - } - - fd_ctx = (void *)(long) value; - - cache = fd_ctx->cache; - -out: - return cache; -} - - -sp_cache_t * -sp_get_cache_fd (xlator_t *this, fd_t *fd) -{ - sp_cache_t *cache = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - - LOCK (&fd->lock); - { - cache = __sp_get_cache_fd (this, fd); - if (cache != NULL) { - sp_cache_ref (cache); - } - } - UNLOCK (&fd->lock); - -out: - return cache; -} - - -void -sp_fd_ctx_free (sp_fd_ctx_t *fd_ctx) -{ - if (fd_ctx == NULL) { - goto out; - } - - if (fd_ctx->parent_inode) { - inode_unref (fd_ctx->parent_inode); - fd_ctx->parent_inode = NULL; - } - - if (fd_ctx->name) { - GF_FREE (fd_ctx->name); - fd_ctx->name = NULL; - } - - if (fd_ctx->cache) { - sp_cache_free (fd_ctx->cache); - } - - GF_FREE (fd_ctx); -out: - return; -} - - -inline sp_fd_ctx_t * -sp_fd_ctx_init (void) -{ - sp_fd_ctx_t *fd_ctx = NULL; - - fd_ctx = GF_CALLOC (1, sizeof (*fd_ctx), gf_sp_mt_sp_fd_ctx_t); - - return fd_ctx; -} - - -sp_fd_ctx_t * -sp_fd_ctx_new (xlator_t *this, inode_t *parent, char *name, sp_cache_t *cache) -{ - sp_fd_ctx_t *fd_ctx = NULL; - - fd_ctx = sp_fd_ctx_init (); - if (fd_ctx == NULL) { - goto out; - } - - if (parent) { - fd_ctx->parent_inode = inode_ref (parent); - } - - if (name) { - fd_ctx->name = gf_strdup (name); - if (fd_ctx->name == NULL) { - sp_fd_ctx_free (fd_ctx); - fd_ctx = NULL; - goto out; - } - } - - fd_ctx->cache = cache; - -out: - return fd_ctx; -} - - -sp_cache_t * -sp_del_cache_fd (xlator_t *this, fd_t *fd) -{ - sp_cache_t *cache = NULL; - uint64_t value = 0; - int32_t ret = -1; - sp_fd_ctx_t *fd_ctx = NULL; - - if (fd == NULL) { - goto out; - } - - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &value); - if (ret == 0) { - fd_ctx = 
(void *)(long) value; - cache = fd_ctx->cache; - fd_ctx->cache = NULL; - } - } - UNLOCK (&fd->lock); - -out: - return cache; -} - - -sp_cache_t * -sp_get_cache_inode (xlator_t *this, inode_t *inode, int32_t pid) -{ - fd_t *fd = NULL; - sp_cache_t *cache = NULL; - - if (inode == NULL) { - goto out; - } - - fd = fd_lookup (inode, pid); - if (fd == NULL) { - goto out; - } - - cache = sp_get_cache_fd (this, fd); - - fd_unref (fd); -out: - return cache; -} - - -void -sp_remove_caches_from_all_fds_opened (xlator_t *this, inode_t *inode, - char *name) -{ - fd_t *fd = NULL; - sp_cache_t *cache = NULL; - struct fd_wrapper { - fd_t *fd; - struct list_head list; - }; - - struct fd_wrapper *wrapper = NULL, *tmp = NULL; - struct list_head head = {0, }; - char remove_all = 0; - - wrapper = NULL; - - INIT_LIST_HEAD (&head); - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - remove_all = (name == NULL); - - LOCK (&inode->lock); - { - list_for_each_entry (fd, &inode->fd_list, inode_list) { - wrapper = GF_CALLOC (1, sizeof (*wrapper), - gf_sp_mt_fd_wrapper_t); - if (wrapper == NULL) { - goto unlock; - } - - INIT_LIST_HEAD (&wrapper->list); - - wrapper->fd = __fd_ref (fd); - list_add_tail (&wrapper->list, &head); - } - } -unlock: - UNLOCK (&inode->lock); - - list_for_each_entry_safe (wrapper, tmp, &head, list) { - cache = sp_get_cache_fd (this, wrapper->fd); - if (cache) { - sp_cache_remove_entry (cache, name, remove_all); - sp_cache_unref (cache); - } - - list_del (&wrapper->list); - fd_unref (wrapper->fd); - GF_FREE (wrapper); - } - -out: - return; -} - - -inline int32_t -__sp_put_cache (xlator_t *this, fd_t *fd, sp_cache_t *cache) -{ - sp_fd_ctx_t *fd_ctx = NULL; - int32_t ret = -1; - uint64_t value = 0; - - ret = __fd_ctx_get (fd, this, &value); - if (!ret) { - fd_ctx = (void *)(long)value; - } else { - fd_ctx = sp_fd_ctx_init (); - if (fd_ctx == NULL) { - ret = -1; - goto out; - } - - ret = __fd_ctx_set (fd, this, 
(long)(void *)fd_ctx); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set stat-prefetch context in fd (%p) " - "opened on inode (gfid:%s)", - fd, uuid_utoa (fd->inode->gfid)); - sp_fd_ctx_free (fd_ctx); - goto out; - } - } - - if (fd_ctx->cache) { - sp_cache_free (fd_ctx->cache); - } - - fd_ctx->cache = cache; - -out: - return ret; -} - - -inline int32_t -sp_put_cache (xlator_t *this, fd_t *fd, sp_cache_t *cache) -{ - int32_t ret = -1; - - if (fd != NULL) { - LOCK (&fd->lock); - { - ret = __sp_put_cache (this, fd, cache); - } - UNLOCK (&fd->lock); - } - - return ret; -} - - -int32_t -sp_cache_add_entries (sp_cache_t *cache, gf_dirent_t *entries) -{ - gf_dirent_t *entry = NULL, *new = NULL; - int32_t ret = -1; - uint64_t expected_offset = 0; - xlator_t *this = NULL; - sp_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", cache, out); - - this = cache->this; - if (this && this->private) { - priv = this->private; - } - - LOCK (&cache->lock); - { - list_for_each_entry (entry, &entries->list, list) { - if (IA_ISDIR (entry->d_stat.ia_type)) { - continue; - } - - if (uuid_is_null (entry->d_stat.ia_gfid)) - continue; - - new = gf_dirent_for_name (entry->d_name); - if (new == NULL) { - gf_log (cache->this->name, GF_LOG_WARNING, - "cannot create a new dentry to store " - "in cache"); - goto unlock; - } - - new->d_ino = entry->d_ino; - new->d_off = entry->d_off; - new->d_len = entry->d_len; - new->d_type = entry->d_type; - new->d_stat = entry->d_stat; - - ret = rbthash_insert (cache->table, new, new->d_name, - strlen (new->d_name)); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "cannot " - "insert dentry (name:%s) into cache", - new->d_name); - - GF_FREE (new); - continue; - } - - expected_offset = new->d_off; - if (priv) { - LOCK (&priv->lock); - { - priv->entries++; - } - UNLOCK (&priv->lock); - } - } - - cache->expected_offset = expected_offset; - - ret = 0; - } -unlock: - UNLOCK (&cache->lock); - -out: - return ret; -} - - 
-int32_t -sp_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *dict, struct iatt *postparent) -{ - struct list_head waiting_ops = {0, }; - call_stub_t *stub = NULL, *tmp = NULL; - sp_local_t *local = NULL; - int need_unwind = 0; - char looked_up = 0, lookup_in_progress = 0; - - GF_ASSERT (frame); - - INIT_LIST_HEAD (&waiting_ops); - - local = frame->local; - if (local == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, "local is NULL, but it is " - "needed to find and resume operations waiting on " - "lookup"); - goto out; - } - - if (this == NULL) { - op_ret = -1; - op_errno = EINVAL; - gf_log (frame->this ? frame->this->name : "stat-prefetch", - GF_LOG_WARNING, "xlator object (this) is NULL"); - goto out; - } - - /* For '/' Entry is never cached, don't try to remove it */ - if ((op_ret == -1) && local->loc.parent) { - sp_remove_caches_from_all_fds_opened (this, local->loc.parent, - (char *)local->loc.name); - } - - if (local->is_lookup) - need_unwind = 1; - - lookup_in_progress = 0; - looked_up = 1; - sp_update_inode_ctx (this, local->loc.inode, &op_ret, &op_errno, - &lookup_in_progress, &looked_up, buf, - &waiting_ops, &op_errno); - - list_for_each_entry_safe (stub, tmp, &waiting_ops, list) { - list_del_init (&stub->list); - call_resume (stub); - } - -out: - if (need_unwind) { - SP_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, - dict, postparent); - } - - return 0; -} - - -int32_t -sp_get_ancestors (char *path, char **parent, char **grand_parent) -{ - int32_t ret = -1, i = 0; - char *cpy = NULL, *tmp = NULL; - - if (!path || !parent || !grand_parent) { - ret = 0; - goto out; - } - - for (i = 0; i < 2; i++) { - if (!strcmp (path, "/")) { - break; - } - - tmp = cpy; - - cpy = gf_strdup (path); - - if (tmp != NULL) { - GF_FREE (tmp); - } - - if (cpy == NULL) { - ret = -errno; - goto out; - } - - path = dirname (cpy); - switch 
(i) - { - case 0: - *parent = gf_strdup (path); - if (*parent == NULL) - goto out; - break; - case 1: - *grand_parent = gf_strdup (path); - if (*grand_parent == NULL) - goto out; - break; - } - } - - ret = 0; -out: - if (cpy != NULL) - GF_FREE(cpy); - return ret; -} - - -int32_t -sp_cache_remove_parent_entry (call_frame_t *frame, xlator_t *this, - inode_table_t *itable, char *path) -{ - char *parent = NULL, *grand_parent = NULL, *cpy = NULL; - inode_t *inode_gp = NULL; - int32_t ret = -1; - - ret = sp_get_ancestors (path, &parent, &grand_parent); - if (ret < 0) { - goto out; - } - - if (grand_parent && strcmp (grand_parent, "/")) { - inode_gp = inode_from_path (itable, grand_parent); - if (inode_gp) { - cpy = gf_strdup (parent); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, - cpy, out, ret, - -ENOMEM); - path = basename (cpy); - sp_remove_caches_from_all_fds_opened (this, inode_gp, - path); - GF_FREE (cpy); - - inode_unref (inode_gp); - } - } - - ret = 0; -out: - if (parent) { - GF_FREE (parent); - } - - if (grand_parent) { - GF_FREE (grand_parent); - } - - return ret; -} - - -void -sp_is_empty (dict_t *this, char *key, data_t *value, void *data) -{ - char *ptr = data; - - if (strcmp (key, "gfid-req") == 0) - return; - - if (ptr && *ptr) { - *ptr = 0; - } -} - - -int32_t -sp_lookup_helper (call_frame_t *frame,xlator_t *this, loc_t *loc, - dict_t *xattr_req) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0; - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode " - "(gfid:%s)", uuid_utoa (loc->inode->gfid)); - op_errno = EINVAL; - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, inode_ctx, unwind, op_errno, - EINVAL); - - stub = fop_lookup_stub (frame, sp_lookup_helper, loc, xattr_req); - 
GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, stub, unwind, - op_errno, ENOMEM); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - if (op_ret == 0) { - if (!inode_ctx->lookup_in_progress) { - inode_ctx->lookup_in_progress = 1; - can_wind = 1; - } else { - list_add_tail (&stub->list, - &inode_ctx->waiting_ops); - stub = NULL; - } - } - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - goto unwind; - } - - if (can_wind) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, - xattr_req); - } - - if (stub != NULL) { - call_stub_destroy (stub); - } - - return 0; - -unwind: - SP_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - if (stub != NULL) { - call_stub_destroy (stub); - } - - return 0; -} - - -/* - * TODO: implement sending lookups for every fop done on this path. As of now - * lookup on the path is sent only for the first fop on this path. - */ -int32_t -sp_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) -{ - gf_dirent_t *dirent = NULL; - char entry_cached = 0; - uint64_t value = 0; - char xattr_req_empty = 1, can_wind = 0; - sp_cache_t *cache = NULL; - struct iatt postparent = {0, }, buf = {0, }; - int32_t ret = -1, op_ret = -1, op_errno = EINVAL; - sp_inode_ctx_t *inode_ctx = NULL, *parent_inode_ctx = NULL; - sp_local_t *local = NULL; - call_stub_t *stub = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc->inode, unwind); - - inode_ctx = sp_check_and_create_inode_ctx (this, loc->inode, - SP_DONT_CARE); - if (inode_ctx == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot create stat-prefetch context in inode " - "(gfid:%s)(%s)", - uuid_utoa (loc->inode->gfid), strerror (op_errno)); - goto unwind; - } - - if ((loc->parent == NULL) || (loc->name == NULL)) { - 
goto wind; - } - - if (xattr_req != NULL) { - dict_foreach (xattr_req, sp_is_empty, &xattr_req_empty); - } - - if (!xattr_req_empty) { - goto wind; - } - - cache = sp_get_cache_inode (this, loc->parent, frame->root->pid); - if (cache) { - ret = sp_cache_get_entry (cache, (char *)loc->name, &dirent); - if (ret == 0) { - if (!uuid_is_null (loc->inode->gfid) - && (uuid_compare (loc->inode->gfid, - dirent->d_stat.ia_gfid)) - != 0) { - op_ret = -1; - op_errno = ESTALE; - goto unwind; - } - - ret = inode_ctx_get (loc->parent, this, &value); - if ((ret == 0) && (value != 0)) { - parent_inode_ctx = (void *)(long)value; - postparent = parent_inode_ctx->stbuf; - buf = dirent->d_stat; - op_ret = 0; - op_errno = 0; - entry_cached = 1; - } - - GF_FREE (dirent); - } - } else if (IA_ISDIR (loc->inode->ia_type)) { - cache = sp_get_cache_inode (this, loc->inode, frame->root->pid); - if (cache) { - ret = sp_cache_get_entry (cache, ".", &dirent); - if (ret == 0) { - ret = inode_ctx_get (loc->parent, this, &value); - if ((ret == 0) && (value != 0)) { - parent_inode_ctx = (void *)(long)value; - postparent = parent_inode_ctx->stbuf; - buf = dirent->d_stat; - op_ret = 0; - op_errno = 0; - entry_cached = 1; - } - - GF_FREE (dirent); - } - } - } - -wind: - if (entry_cached) { - if (cache) { - cache->hits++; - sp_cache_unref (cache); - } - } else { - if (cache) { - cache->miss++; - sp_cache_unref (cache); - } - - stub = fop_lookup_stub (frame, sp_lookup_helper, loc, - xattr_req); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, stub, unwind, - op_errno, ENOMEM); - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, - op_errno, ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, - "loc_copy failed (%s)", strerror (op_errno)); - goto unwind; - } - - local->is_lookup = 1; - - LOCK (&inode_ctx->lock); - { - if 
(inode_ctx->lookup_in_progress) { - list_add_tail (&stub->list, - &inode_ctx->waiting_ops); - stub = NULL; - } else { - can_wind = 1; - inode_ctx->lookup_in_progress = 1; - } - } - UNLOCK (&inode_ctx->lock); - - if (can_wind) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, - xattr_req); - } - - if (stub != NULL) { - call_stub_destroy (stub); - } - - return 0; - } - -unwind: - SP_STACK_UNWIND (lookup, frame, op_ret, op_errno, (loc)?loc->inode:NULL, - &buf, NULL, &postparent); - - return 0; -} - - -int32_t -sp_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) -{ - sp_local_t *local = NULL; - sp_cache_t *cache = NULL; - fd_t *fd = NULL; - int32_t ret = 0; - char was_present = 1; - sp_private_t *priv = NULL; - - GF_ASSERT (frame); - if (op_ret == -1) { - goto out; - } - - if ((this == NULL) || (this->private == NULL)) { - gf_log (frame->this->name, GF_LOG_WARNING, - (this == NULL) ? 
"xlator object (this) is NULL" - : "stat-prefetch configuration (this->private) is " - "NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - local = frame->local; - if (local == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, "local is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - fd = local->fd; - - priv = this->private; - - LOCK (&priv->lock); - { - if (!priv->mem_pool) - priv->mem_pool = mem_pool_new (rbthash_entry_t, - GF_SP_CACHE_ENTRIES_EXPECTED); - } - UNLOCK (&priv->lock); - - if (!priv->mem_pool) - goto out; - - LOCK (&fd->lock); - { - cache = __sp_get_cache_fd (this, fd); - if (cache == NULL) { - was_present = 0; - cache = sp_cache_init (this); - if (cache == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "creation of stat-prefetch cache " - "for fd (%p) opened on inode " - "(gfid:%s) failed", fd, - uuid_utoa (fd->inode->gfid)); - goto unlock; - } - - ret = __sp_put_cache (this, fd, cache); - if (ret == -1) { - sp_cache_free (cache); - gf_log (this->name, GF_LOG_WARNING, - "cannot store cache in fd (%p) opened " - "on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unlock; - } - } - - sp_cache_ref (cache); - } -unlock: - UNLOCK (&fd->lock); - - if (cache != NULL) { - sp_cache_add_entries (cache, entries); - if (was_present) { - sp_cache_unref (cache); - } - } - -out: - SP_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); - return 0; -} - - -int32_t -sp_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) -{ - sp_cache_t *cache = NULL; - sp_local_t *local = NULL; - char *path = NULL; - int32_t ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - cache = sp_get_cache_fd (this, fd); - if (cache) { - if (off != cache->expected_offset) { - sp_cache_remove_entry (cache, NULL, 1); - } - - sp_cache_unref (cache); - } - - ret = inode_path (fd->inode, NULL, &path); - if (ret 
< 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_WARNING, "cannot construct path on " - "which fd (%p) is opened (gfid = %s) (%s)", fd, - uuid_utoa (fd->inode->gfid), strerror (op_errno)); - goto unwind; - } - - ret = sp_cache_remove_parent_entry (frame, this, fd->inode->table, - path); - - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache" - " for path %s", path); - goto unwind; - } - - GF_FREE (path); - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - if (local) { - local->fd = fd; - frame->local = local; - } - - STACK_WIND (frame, sp_readdir_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdirp, fd, size, off); - - return 0; - -unwind: - if (path != NULL) { - GF_FREE (path); - } - - SP_STACK_UNWIND (readdir, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -sp_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - GF_ASSERT (frame); - - SP_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} - - - -int32_t -sp_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (rename, frame, op_ret, op_errno, buf, preoldparent, - postoldparent, prenewparent, postnewparent); - return 0; -} - - -int32_t -sp_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) -{ - sp_local_t *local = NULL; - sp_fd_ctx_t *fd_ctx = NULL; - - GF_ASSERT (frame); - - if (op_ret == -1) { - goto out; - } - - if (this == NULL) { - gf_log (frame->this ? 
frame->this->name : "stat-prefetch", - GF_LOG_WARNING, "xlator object (this) is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - if (fd == NULL) { - gf_log (this->name, GF_LOG_WARNING, "fd is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - fd_ctx = sp_fd_ctx_new (this, local->loc.parent, - (char *)local->loc.name, NULL); - if (fd_ctx == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - op_ret = fd_ctx_set (fd, this, (long)(void *)fd_ctx); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set stat-prefetch context in fd (%p) opened on " - "inode (gfid:%s)", fd, uuid_utoa (fd->inode->gfid)); - sp_fd_ctx_free (fd_ctx); - op_errno = ENOMEM; - } - -out: - SP_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - return 0; -} - - -int32_t -sp_open_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode " - "(gfid:%s)", uuid_utoa (loc->inode->gfid)); - op_errno = EINVAL; - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, inode_ctx, unwind, op_errno, - EINVAL); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if ((op_ret == -1) && ((op_errno != ENOENT) - || !((op_errno == ENOENT) - && (flags & O_CREAT)))) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for 
path (%s)(%s), unwinding open call waiting on " - "it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_fd_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, wbflags); - - return 0; - -unwind: - SP_STACK_UNWIND (open, frame, -1, op_errno, fd); - return 0; -} - - -int32_t -sp_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int wbflags) -{ - call_stub_t *stub = NULL; - sp_local_t *local = NULL; - int32_t op_errno = EINVAL, ret = -1; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed (%s)", - strerror (op_errno)); - goto out; - } - - stub = fop_open_stub (frame, sp_open_helper, loc, flags, fd, wbflags); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); -out: - if (need_unwind) { - SP_STACK_UNWIND (open, frame, -1, op_errno, fd); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_fd_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, - wbflags); - } - - return 0; - -} - -static int32_t -sp_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - sp_local_t *local = NULL; - sp_fd_ctx_t *fd_ctx = NULL; - char 
lookup_in_progress = 0, looked_up = 0; - - GF_ASSERT (frame); - - if (op_ret == -1) { - goto out; - } - - if (this == NULL) { - gf_log (frame->this ? frame->this->name : "stat-prefetch", - GF_LOG_WARNING, "xlator object (this) is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - local = frame->local; - if (local == NULL) { - gf_log (this->name, GF_LOG_WARNING, "local is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - looked_up = 1; - op_ret = sp_update_inode_ctx (this, local->loc.inode, &op_ret, - &op_errno, &lookup_in_progress, - &looked_up, buf, NULL, &op_errno); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "updating stat-prefetch context in inode " - "(gfid:%s) (path: %s) failed (%s)", - uuid_utoa (local->loc.inode->gfid), local->loc.path, - strerror (op_errno)); - goto out; - } - - op_ret = sp_update_inode_ctx (this, local->loc.parent, NULL, NULL, NULL, - NULL, postparent, NULL, &op_errno); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "updating stat-prefetch context in parent inode failed " - "for path (%s)(%s)", local->loc.path, - strerror (op_errno)); - goto out; - } - - fd_ctx = sp_fd_ctx_new (this, local->loc.parent, - (char *)local->loc.name, NULL); - if (fd_ctx == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - op_ret = fd_ctx_set (fd, this, (long)(void *)fd_ctx); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot set stat-prefetch context in fd (%p) opened on " - "inode (gfid:%s)", fd, uuid_utoa (fd->inode->gfid)); - sp_fd_ctx_free (fd_ctx); - op_errno = ENOMEM; - } - -out: - SP_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; -} - - -int32_t -sp_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd, dict_t *params) -{ - sp_local_t *local = NULL; - int32_t op_errno = -1, ret = -1; - char need_unwind = 1; - sp_inode_ctx_t *inode_ctx = NULL; - - 
GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, loc, out, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, loc->path, out, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, loc->name, out, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, loc->inode, out, - op_errno, EINVAL); - - ret = sp_cache_remove_parent_entry (frame, this, loc->inode->table, - (char *)loc->path); - if (ret == -1) { - op_errno = ENOMEM; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, - "loc_copy failed (%s)", strerror (op_errno)); - goto out; - } - - inode_ctx = sp_check_and_create_inode_ctx (this, loc->inode, - SP_DONT_EXPECT); - if (inode_ctx == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot create stat-prefetch context in inode " - "(gfid:%s)(%s)", uuid_utoa (loc->inode->gfid), - strerror (op_errno)); - goto out; - } - - need_unwind = 0; -out: - if (need_unwind) { - SP_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); - } else { - STACK_WIND (frame, sp_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, loc, flags, - mode, fd, params); - } - return 0; -} - - -int32_t -sp_opendir_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode " - "(gfid:%s)", uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding opendir call waiting " - "on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_fd_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, loc, fd); - - return 0; - -unwind: - SP_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -sp_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) -{ - sp_local_t *local = NULL; - call_stub_t *stub = NULL; - int32_t op_errno = EINVAL, ret = -1; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed (%s)", - strerror (op_errno)); - goto out; - } - - stub = fop_opendir_stub (frame, sp_opendir_helper, loc, fd); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (opendir, frame, -1, op_errno, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_fd_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->opendir, loc, fd); - } - - return 0; -} - - -int32_t -sp_new_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - sp_local_t *local = NULL; - char lookup_in_progress = 0, looked_up = 0; - - GF_ASSERT (frame); - - if (op_ret == -1) { - goto out; - } - - local = frame->local; - if (local == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, "local is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - if (this == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "xlator object (this) is NULL"); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - looked_up = 1; - op_ret = sp_update_inode_ctx (this, local->loc.inode, &op_ret, - &op_errno, &lookup_in_progress, - &looked_up, buf, NULL, &op_errno); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "updating stat-prefetch context 
in inode " - "(gfid:%s) (path: %s) failed (%s)", - uuid_utoa (local->loc.inode->gfid), local->loc.path, - strerror (op_errno)); - goto out; - } - - op_ret = sp_update_inode_ctx (this, local->loc.parent, NULL, NULL, NULL, - NULL, postparent, NULL, &op_errno); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "updating stat-prefetch context in parent inode failed " - "for path (%s)(%s)", local->loc.path, - strerror (op_errno)); - } - -out: - SP_STACK_UNWIND (mkdir, frame, op_ret, op_errno, inode, buf, preparent, - postparent); - return 0; -} - - -int -sp_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dict_t *params) -{ - int32_t ret = -1, op_errno = EINVAL; - char need_unwind = 1; - sp_inode_ctx_t *inode_ctx = NULL; - sp_local_t *local = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->path, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - ret = sp_cache_remove_parent_entry (frame, this, loc->inode->table, - (char *)loc->path); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", loc->path); - op_errno = ENOMEM; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed (%s)", - strerror (op_errno)); - goto out; - } - - inode_ctx = sp_check_and_create_inode_ctx (this, loc->inode, - SP_DONT_EXPECT); - if (inode_ctx == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot create stat-prefetch context in inode " - "(gfid:%s)(%s)", uuid_utoa (loc->inode->gfid), - strerror (op_errno)); - goto out; 
- } - - need_unwind = 0; -out: - if (need_unwind) { - SP_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - } else { - STACK_WIND (frame, sp_new_entry_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mkdir, loc, mode, params); - } - - return 0; -} - - -int -sp_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, dict_t *params) -{ - int32_t op_errno = EINVAL, ret = -1; - char need_unwind = 1; - sp_inode_ctx_t *inode_ctx = NULL; - sp_local_t *local = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->path, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - ret = sp_cache_remove_parent_entry (frame, this, loc->inode->table, - (char *)loc->path); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", loc->path); - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed (%s)", - strerror (op_errno)); - goto out; - } - - inode_ctx = sp_check_and_create_inode_ctx (this, loc->inode, - SP_DONT_EXPECT); - if (inode_ctx == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot create stat-prefetch context in inode " - "(gfid:%s)(%s)", uuid_utoa (loc->inode->gfid), - strerror (op_errno)); - goto out; - } - - need_unwind = 0; -out: - if (need_unwind) { - SP_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - } else { - STACK_WIND (frame, sp_new_entry_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->mknod, loc, mode, - 
rdev, params); - } - - return 0; -} - - -int -sp_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, dict_t *params) -{ - int32_t ret = -1, op_errno = EINVAL; - char need_unwind = 1; - sp_inode_ctx_t *inode_ctx = NULL; - sp_local_t *local = NULL; - - GF_ASSERT (frame); - - GF_VALIDATE_OR_GOTO ((frame->this ? frame->this->name - : "stat-prefetch"), - this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->path, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - ret = sp_cache_remove_parent_entry (frame, this, loc->inode->table, - (char *)loc->path); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", loc->path); - op_errno = ENOMEM; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), gf_sp_mt_sp_local_t); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - ENOMEM); - - frame->local = local; - - ret = loc_copy (&local->loc, loc); - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_WARNING, "loc_copy failed (%s)", - strerror (op_errno)); - goto out; - } - - inode_ctx = sp_check_and_create_inode_ctx (this, loc->inode, - SP_DONT_EXPECT); - if (inode_ctx == NULL) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "%s: cannot create stat-prefetch context (gfid:%s)(%s)", - loc->path, loc->inode->gfid, strerror (op_errno)); - goto out; - } - - need_unwind = 0; -out: - if (need_unwind) { - SP_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - } else { - STACK_WIND (frame, sp_new_entry_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, linkpath, loc, - params); - } - - return 0; -} - - -int32_t -sp_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) 
-{ - GF_ASSERT (frame); - SP_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, preparent, - postparent); - return 0; -} - - -int32_t -sp_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, oldloc, unwind); - GF_VALIDATE_OR_GOTO (this->name, newloc, unwind); - - ret = inode_ctx_get (oldloc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (oldloc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, inode_ctx, unwind, op_errno, - EINVAL); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding link call waiting on " - "it", oldloc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_link_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, oldloc, newloc); - - return 0; - -unwind: - SP_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; -} - - -int32_t -sp_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc) -{ - call_stub_t *stub = NULL; - int32_t ret = 0, op_errno = EINVAL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, newloc, out); - GF_VALIDATE_OR_GOTO (this->name, newloc->path, out); - GF_VALIDATE_OR_GOTO (this->name, newloc->name, out); - GF_VALIDATE_OR_GOTO (this->name, newloc->inode, out); - GF_VALIDATE_OR_GOTO (this->name, oldloc->name, out); - - ret = sp_cache_remove_parent_entry (frame, this, newloc->parent->table, - (char *)newloc->path); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", newloc->path); - op_errno = ENOMEM; - goto out; - } - - sp_remove_caches_from_all_fds_opened (this, oldloc->parent, - (char *)oldloc->name); - - stub = fop_link_stub (frame, sp_link_helper, oldloc, newloc); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, oldloc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, oldloc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_link_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->link, oldloc, newloc); - } - - return 0; -} - - -int32_t -sp_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, inode_ctx, unwind, op_errno, - EINVAL); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding truncate call " - "waiting on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - - return 0; - -unwind: - SP_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - stub = fop_truncate_stub (frame, sp_truncate_helper, loc, offset); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - } - - return 0; -} - - -int32_t -sp_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value = 0; - int32_t ret = 0, op_errno = EINVAL; - inode_t *parent = NULL; - char *name = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - ret = fd_ctx_get (fd, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "stat-prefetch context not " - "set in fd (%p) opened on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } - - fd_ctx = (void *)(long)value; - name = fd_ctx->name; - parent = fd_ctx->parent_inode; - - sp_remove_caches_from_all_fds_opened (this, parent, (char *)name); - - STACK_WIND (frame, sp_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - return 0; - -unwind: - SP_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *prestat, struct iatt *poststat) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (setattr, frame, op_ret, op_errno, prestat, poststat); - return 0; -} - - -int -sp_setattr_helper (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - op_errno = EINVAL; - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding setattr call " - "waiting on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, buf, valid); - - return 0; - -unwind: - SP_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int -sp_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - stub = fop_setattr_stub (frame, sp_setattr_helper, loc, buf, valid); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, buf, valid); - } - - return 0; -} - - -int32_t -sp_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, const char *path, - struct iatt *buf) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf); - return 0; -} - - -int32_t -sp_readlink_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - size_t size) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding readlink call " - "waiting on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_readlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, loc, size); - - return 0; - -unwind: - SP_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - stub = fop_readlink_stub (frame, sp_readlink_helper, loc, size); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_readlink_cbk, FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->readlink, loc, size); - } - - return 0; -} - - -int32_t -sp_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent, - postparent); - return 0; -} - - - -int32_t -sp_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (setxattr, frame, op_ret, op_errno); - return 0; -} - - -int32_t -sp_unlink_helper (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding unlink call " - "waiting on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc); - - return 0; - -unwind: - SP_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - int32_t ret = -1, op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_VALIDATE_OR_GOTO (this->name, loc, out); - 
GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - ret = sp_cache_remove_parent_entry (frame, this, loc->parent->table, - (char *)loc->path); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", loc->path); - goto out; - } - - stub = fop_unlink_stub (frame, sp_unlink_helper, loc); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, loc); - } - - return 0; -} - - -int -sp_rmdir_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "lookup-behind has failed " - "for path (%s)(%s), unwinding rmdir call " - "waiting on it", loc->path, strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, 
sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, loc, flags); - - return 0; - -unwind: - SP_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int -sp_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags) -{ - int32_t ret = -1, op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - GF_VALIDATE_OR_GOTO (this->name, loc->path, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - sp_remove_caches_from_all_fds_opened (this, loc->inode, NULL); - - ret = sp_cache_remove_parent_entry (frame, this, loc->inode->table, - (char *)loc->path); - if (ret == -1) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", loc->path); - goto out; - } - - stub = fop_rmdir_stub (frame, sp_rmdir_helper, loc, flags); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rmdir, loc, flags); - } - - return 0; -} - - -int32_t -sp_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobref) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref); - return 0; -} - - -int32_t -sp_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t 
size, - off_t offset) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value = 0; - int32_t ret = 0, op_errno = EINVAL; - inode_t *parent = NULL; - char *name = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - ret = fd_ctx_get (fd, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "stat-prefetch context not " - "set in fd (%p) opened on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } - - fd_ctx = (void *)(long)value; - name = fd_ctx->name; - parent = fd_ctx->parent_inode; - - sp_remove_caches_from_all_fds_opened (this, parent, (char *)name); - - STACK_WIND (frame, sp_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset); - return 0; - -unwind: - SP_STACK_UNWIND (readv, frame, -1, op_errno, NULL, -1, NULL, NULL); - return 0; -} - - -int32_t -sp_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t off, struct iobref *iobref) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value = 0; - int32_t ret = 0, op_errno = EINVAL; - inode_t *parent = NULL; - char *name = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - ret = fd_ctx_get (fd, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "stat-prefetch context not " - "set in fd (%p) opened on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } - - fd_ctx = (void *)(long)value; - name = fd_ctx->name; - parent = fd_ctx->parent_inode; - - sp_remove_caches_from_all_fds_opened (this, parent, (char *)name); - - STACK_WIND (frame, sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, fd, vector, count, off, - iobref); - return 0; - -unwind: - SP_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t 
flags) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value = 0; - int32_t ret = 0, op_errno = EINVAL; - inode_t *parent = NULL; - char *name = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - ret = fd_ctx_get (fd, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "stat-prefetch context not " - "set in fd (%p) opened on inode (gfid:%s)", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } - - fd_ctx = (void *)(long)value; - name = fd_ctx->name; - parent = fd_ctx->parent_inode; - - sp_remove_caches_from_all_fds_opened (this, parent, (char *)name); - - STACK_WIND (frame, sp_unlink_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, flags); - return 0; - -unwind: - SP_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL); - return 0; -} - - -int32_t -sp_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) -{ - uint64_t value = 0; - char need_unwind = 0; - char can_wind = 0; - int32_t ret = 0, op_errno = EINVAL; - int32_t old_op_ret = -1, old_op_errno = -1; - int32_t new_op_ret = -1, new_op_errno = -1; - char old_inode_looked_up = 0, new_inode_looked_up = 0; - sp_inode_ctx_t *old_inode_ctx = NULL, *new_inode_ctx = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? 
frame->this->name : "stat-prefetch", - this, unwind); - GF_VALIDATE_OR_GOTO (this->name, oldloc, unwind); - GF_VALIDATE_OR_GOTO (this->name, newloc, unwind); - - ret = inode_ctx_get (oldloc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (oldloc->inode->gfid)); - goto unwind; - } - - old_inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, old_inode_ctx, unwind); - - LOCK (&old_inode_ctx->lock); - { - old_inode_looked_up = old_inode_ctx->looked_up; - old_op_ret = old_inode_ctx->op_ret; - old_op_errno = old_inode_ctx->op_errno; - need_unwind = old_inode_ctx->need_unwind; - } - UNLOCK (&old_inode_ctx->lock); - - if (need_unwind) { - /* there was an error while queuing up lookup stub for newloc */ - gf_log (this->name, GF_LOG_WARNING, - "could not queue lookup stub for path (%s)", - newloc->path); - goto unwind; - } - - if (newloc->inode != NULL) { - ret = inode_ctx_get (newloc->inode, this, &value); - if (ret == 0) { - new_inode_ctx = (sp_inode_ctx_t *)(long)value; - if (new_inode_ctx != NULL) { - LOCK (&new_inode_ctx->lock); - { - new_inode_looked_up - = new_inode_ctx->looked_up; - new_op_ret = new_inode_ctx->op_ret; - new_op_errno = new_inode_ctx->op_errno; - } - UNLOCK (&new_inode_ctx->lock); - } - } - } - - if (new_inode_ctx == NULL) { - if (old_op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed " - "for path (%s)(%s), unwinding rename call " - "waiting on it", oldloc->path, - strerror (old_op_errno)); - - op_errno = old_op_errno; - goto unwind; - } else { - can_wind = 1; - } - } else { - if (new_inode_looked_up && old_inode_looked_up) { - if ((old_op_ret == -1) - || ((new_op_ret == -1) - && (new_op_errno != ENOENT))) { - if (old_op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed " - "for path (%s)(%s), unwinding " - "rename call waiting on it", - oldloc->path, - strerror 
(old_op_errno)); - op_errno = old_op_errno; - } else { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed " - "for path (%s)(%s), unwinding " - "rename call waiting on it", - newloc->path, - strerror (new_op_errno)); - op_errno = new_op_errno; - } - - goto unwind; - } else { - can_wind = 1; - } - } - } - - if (can_wind) { - STACK_WIND (frame, sp_rename_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, oldloc, newloc); - } - - return 0; - -unwind: - SP_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL); - return 0; -} - - -int32_t -sp_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,loc_t *newloc) -{ - char need_unwind = 1; - uint64_t value = 0; - call_stub_t *stub = NULL; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = -1, op_errno = EINVAL; - char old_inode_can_wind = 0, new_inode_can_wind = 0; - char old_inode_need_lookup = 0, new_inode_need_lookup = 0; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this ? frame->this->name : "stat-prefetch", - this, out); - GF_VALIDATE_OR_GOTO (this->name, oldloc, out); - GF_VALIDATE_OR_GOTO (this->name, oldloc->path, out); - GF_VALIDATE_OR_GOTO (this->name, oldloc->name, out); - GF_VALIDATE_OR_GOTO (this->name, oldloc->inode, out); - - GF_VALIDATE_OR_GOTO (this->name, newloc, out); - GF_VALIDATE_OR_GOTO (this->name, newloc->path, out); - - sp_remove_caches_from_all_fds_opened (this, oldloc->parent, - (char *)oldloc->name); - - sp_remove_caches_from_all_fds_opened (this, newloc->parent, - (char *)newloc->name); - - ret = sp_cache_remove_parent_entry (frame, this, oldloc->parent->table, - (char *)oldloc->path); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from grand-parent's cache " - "for path (%s)", oldloc->path); - goto out; - } - - ret = sp_cache_remove_parent_entry (frame, this, newloc->parent->table, - (char *)newloc->path); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "cannot remove parent entry from 
grand-parent's cache " - "for path (%s)", newloc->path); - goto out; - } - - if (IA_ISDIR (oldloc->inode->ia_type)) { - sp_remove_caches_from_all_fds_opened (this, oldloc->inode, - NULL); - } - - stub = fop_rename_stub (frame, sp_rename_helper, oldloc, newloc); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - ret = sp_process_inode_ctx (frame, this, oldloc, stub, &need_unwind, - &old_inode_need_lookup, &old_inode_can_wind, - &op_errno); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "processing stat-prefetch " - "context in inode (gfid:%s) (path:%s) failed (%s)", - uuid_utoa (oldloc->inode->gfid), oldloc->path, - strerror (op_errno)); - goto out; - } - - if (newloc->inode != NULL) { - stub = fop_rename_stub (frame, sp_rename_helper, oldloc, - newloc); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - ret = sp_process_inode_ctx (frame, this, newloc, stub, - &need_unwind, - &new_inode_need_lookup, - &new_inode_can_wind, &op_errno); - if (ret == -1) { - ret = inode_ctx_get (oldloc->inode, this, &value); - - inode_ctx = (sp_inode_ctx_t *)(long)value; - if (inode_ctx == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode" - " (gfid:%s) (path:%s)", - uuid_utoa (oldloc->inode->gfid), - oldloc->path); - goto out; - } - - LOCK (&inode_ctx->lock); - { - if (!inode_ctx->looked_up) { - /* unwind in sp_rename_helper */ - need_unwind = 0; - inode_ctx->need_unwind = 1; - } - } - UNLOCK (&inode_ctx->lock); - } - - } else { - new_inode_can_wind = 1; - } - -out: - if (need_unwind) { - SP_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); - } else if (old_inode_need_lookup || new_inode_need_lookup) { - if (old_inode_need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, oldloc, - NULL); - } - - if (new_inode_need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, newloc, - NULL); - } - } 
else if (old_inode_can_wind && new_inode_can_wind) { - STACK_WIND (frame, sp_rename_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, oldloc, newloc); - } - - return 0; -} - - -int32_t -sp_setxattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - dict_t *dict, int32_t flags) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding setxattr call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, - flags); - - return 0; - -unwind: - SP_STACK_UNWIND (setxattr, frame, -1, op_errno); - return 0; -} - - -int32_t -sp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - stub = fop_setxattr_stub (frame, sp_setxattr_helper, loc, dict, flags); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - 
&need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (setxattr, frame, -1, op_errno); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setxattr, loc, dict, - flags); - } - - return 0; -} - - -int32_t -sp_removexattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding setxattr call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name); - - return 0; - -unwind: - SP_STACK_UNWIND (removexattr, frame, -1, op_errno); - return 0; -} - - -int32_t -sp_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char 
*)loc->name); - - stub = fop_removexattr_stub (frame, sp_removexattr_helper, loc, name); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (removexattr, frame, -1, op_errno); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->removexattr, loc, name); - } - - return 0; -} - - -int32_t -sp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - return 0; -} - - -int32_t -sp_getxattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding getxattr call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); - - return 0; - -unwind: - SP_STACK_UNWIND 
(getxattr, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -sp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - stub = fop_getxattr_stub (frame, sp_getxattr_helper, loc, name); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_getxattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name); - } - - return 0; -} - - -int32_t -sp_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict); - return 0; -} - - -int32_t -sp_xattrop_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - 
UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding xattrop call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_xattrop_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, loc, flags, dict); - - return 0; - -unwind: - SP_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -sp_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); - - sp_remove_caches_from_all_fds_opened (this, loc->parent, - (char *)loc->name); - - stub = fop_xattrop_stub (frame, sp_xattrop_helper, loc, flags, dict); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_xattrop_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->xattrop, loc, flags, dict); - } - - return 0; -} - - -int32_t -sp_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t flags, dict_t *dict) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value = 0; - int32_t ret = 0, op_errno = EINVAL; - inode_t *parent = NULL; - char *name = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, fd, unwind); - - ret = fd_ctx_get (fd, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, "stat-prefetch context not " - 
"set in fd (%p) opened on inode (gfid:%s", fd, - uuid_utoa (fd->inode->gfid)); - goto unwind; - } - - fd_ctx = (void *)(long)value; - name = fd_ctx->name; - parent = fd_ctx->parent_inode; - - sp_remove_caches_from_all_fds_opened (this, parent, name); - - STACK_WIND (frame, sp_xattrop_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict); - return 0; - -unwind: - SP_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL); - return 0; -} - -int32_t -sp_stbuf_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *buf) -{ - GF_ASSERT (frame); - SP_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); - return 0; -} - - -int32_t -sp_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding stat call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_stbuf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc); - - return 0; - -unwind: - SP_STACK_UNWIND (stat, frame, -1, op_errno, NULL); - return 0; -} - - -int32_t -sp_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - int32_t op_errno = -1; - call_stub_t *stub = NULL; - char can_wind = 0, 
need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - stub = fop_stat_stub (frame, sp_stat_helper, loc); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (stat, frame, -1, op_errno, NULL); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_stbuf_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc); - } - - return 0; -} - - -int32_t -sp_access_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding access call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->access, loc, mask); - - return 0; - -unwind: - SP_STACK_UNWIND (access, frame, -1, op_errno); - return 0; -} - - -int32_t -sp_access (call_frame_t *frame, 
xlator_t *this, loc_t *loc, int32_t mask) -{ - int32_t op_errno = -1; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - stub = fop_access_stub (frame, sp_access_helper, loc, mask); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (access, frame, -1, op_errno); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->access, loc, mask); - } - - return 0; -} - - -int32_t -sp_inodelk_helper (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding inodelk call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - 
FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, lock); - - return 0; - -unwind: - SP_STACK_UNWIND (inodelk, frame, -1, op_errno); - return 0; -} - - -int32_t -sp_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, - int32_t cmd, struct gf_flock *lock) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - stub = fop_inodelk_stub (frame, sp_inodelk_helper, volume, loc, cmd, - lock); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (inodelk, frame, -1, op_errno); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, - lock); - } - - return 0; -} - - -int32_t -sp_entrylk_helper (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, const char *basename, entrylk_cmd cmd, - entrylk_type type) -{ - uint64_t value = 0; - sp_inode_ctx_t *inode_ctx = NULL; - int32_t ret = 0, op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, unwind); - GF_VALIDATE_OR_GOTO (this->name, loc, unwind); - - ret = inode_ctx_get (loc->inode, this, &value); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "stat-prefetch context not set in inode (gfid:%s)", - uuid_utoa (loc->inode->gfid)); - goto unwind; - } - - inode_ctx = (sp_inode_ctx_t *)(long) value; - GF_VALIDATE_OR_GOTO (this->name, inode_ctx, unwind); - - LOCK (&inode_ctx->lock); - { - op_ret = inode_ctx->op_ret; - op_errno = 
inode_ctx->op_errno; - } - UNLOCK (&inode_ctx->lock); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "lookup-behind has failed for path (%s)(%s), " - "unwinding entrylk call waiting on it", loc->path, - strerror (op_errno)); - goto unwind; - } - - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, - cmd, type); - - return 0; - -unwind: - SP_STACK_UNWIND (entrylk, frame, -1, op_errno); - return 0; -} - - -int32_t -sp_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) -{ - int32_t op_errno = EINVAL; - call_stub_t *stub = NULL; - char can_wind = 0, need_lookup = 0, need_unwind = 1; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); - GF_VALIDATE_OR_GOTO (this->name, loc->inode, out); - - stub = fop_entrylk_stub (frame, sp_entrylk_helper, volume, loc, - basename, cmd, type); - if (stub == NULL) { - op_errno = ENOMEM; - goto out; - } - - sp_process_inode_ctx (frame, this, loc, stub, &need_unwind, - &need_lookup, &can_wind, &op_errno); - -out: - if (need_unwind) { - SP_STACK_UNWIND (entrylk, frame, -1, op_errno); - } else if (need_lookup) { - STACK_WIND (frame, sp_lookup_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, loc, NULL); - } else if (can_wind) { - STACK_WIND (frame, sp_err_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->entrylk, volume, loc, - basename, cmd, type); - } - - return 0; -} - - -int32_t -sp_forget (xlator_t *this, inode_t *inode) -{ - struct iatt *buf = NULL; - uint64_t value = 0; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - inode_ctx_del (inode, this, &value); - - if (value) { - buf = (void *)(long)value; - GF_FREE (buf); - } - -out: - return 0; -} - - -int32_t -sp_release (xlator_t *this, fd_t *fd) -{ - sp_fd_ctx_t *fd_ctx = NULL; - uint64_t value 
= 0; - int32_t ret = 0; - sp_cache_t *cache = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); - ret = fd_ctx_del (fd, this, &value); - if (!ret) { - fd_ctx = (void *)(long) value; - cache = fd_ctx->cache; - if (cache) { - gf_log (this->name, GF_LOG_TRACE, "cache hits: %lu, " - "cache miss: %lu", cache->hits, cache->miss); - } - - sp_fd_ctx_free (fd_ctx); - } - -out: - return 0; -} - - -struct sp_cache_dump { - int i; - char *key_prefix; -}; -typedef struct sp_cache_dump sp_cache_dump_t; - -void -sp_cache_traverse (void *data, void *mydata) -{ - gf_dirent_t *dirent = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char uuidbuf[256] = {0, }; - sp_cache_dump_t *dump = NULL; - - if ((data == NULL) || (mydata == NULL)) { - goto out; - } - - dirent = data; - dump = mydata; - - gf_proc_dump_build_key (key, dump->key_prefix, "entry[%d].name", - dump->i); - gf_proc_dump_write (key, "%s", dirent->d_name); - - uuid_unparse (dirent->d_stat.ia_gfid, uuidbuf); - gf_proc_dump_build_key (key, dump->key_prefix, "entry[%d].inode.gfid", - dump->i); - gf_proc_dump_write (key, "%s", uuidbuf); - - dump->i++; -out: - return; -} - - -int32_t -sp_fdctx_dump (xlator_t *this, fd_t *fd) -{ - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - char uuidbuf[256] = {0, }; - sp_cache_t *cache = NULL; - int32_t ret = -1; - sp_cache_dump_t *dump = NULL; - char *parent = NULL; - - cache = sp_get_cache_fd (this, fd); - if (cache == NULL) { - ret = 0; - goto out; - } - - dump = GF_CALLOC(1, sizeof (*dump), gf_common_mt_char); - if (dump == NULL) { - goto out; - } - - gf_proc_dump_build_key (key_prefix, - "xlator.performance.stat-prefetch", - "fdctx"); - gf_proc_dump_add_section (key_prefix); - - gf_proc_dump_build_key (key, key_prefix, "fd"); - gf_proc_dump_write (key, "%p", fd); - - ret = __inode_path (fd->inode, NULL, &parent); - if (parent != NULL) { - gf_proc_dump_build_key (key, key_prefix, "name"); - 
gf_proc_dump_write (key, "%s", parent); - GF_FREE (parent); - } - - uuid_unparse (fd->inode->gfid, uuidbuf); - gf_proc_dump_build_key (key, key_prefix, "fd.inode.gfid"); - gf_proc_dump_write (key, "%s", uuidbuf); - - gf_proc_dump_build_key (key, key_prefix, "miss"); - gf_proc_dump_write (key, "%lu", cache->miss); - - gf_proc_dump_build_key (key, key_prefix, "hits"); - gf_proc_dump_write (key, "%lu", cache->hits); - - gf_proc_dump_build_key (key, key_prefix, "cache"); - dump->key_prefix = key; - - rbthash_table_traverse (cache->table, sp_cache_traverse, dump); - - GF_FREE (dump); - ret = 0; -out: - return ret; -} - -int32_t -sp_inodectx_dump (xlator_t *this, inode_t *inode) -{ - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - sp_inode_ctx_t *inode_ctx = NULL; - call_stub_t *stub = NULL; - uint64_t value = 0; - int32_t ret = -1, i = 0; - - if ((this == NULL) || (inode == NULL)) { - goto out; - } - - ret = inode_ctx_get (inode, this, &value); - if (ret == 0) { - inode_ctx = (sp_inode_ctx_t *)(long)value; - } - - if (inode_ctx == NULL) { - goto out; - } - - gf_proc_dump_build_key (key_prefix, - "stat-prefetch", - "inodectx"); - gf_proc_dump_add_section (key_prefix); - - LOCK (&inode_ctx->lock); - { - gf_proc_dump_write ("looked_up", "%s", - inode_ctx->looked_up ? "yes" : "no"); - - gf_proc_dump_write ("lookup_in_progress", "%s", - inode_ctx->lookup_in_progress ? - "yes" : "no"); - - gf_proc_dump_write ("need_unwind", "%s", inode_ctx->need_unwind ? 
- "yes" : "no"); - - gf_proc_dump_write ("op_ret", "%d", inode_ctx->op_ret); - - gf_proc_dump_write ("op_errno", "%d", inode_ctx->op_errno); - - list_for_each_entry (stub, &inode_ctx->waiting_ops, list) { - gf_proc_dump_build_key (key, "", - "waiting-ops[%d].frame", i); - gf_proc_dump_write (key, "%"PRId64, - stub->frame->root->unique); - - gf_proc_dump_build_key (key, "", - "waiting-ops[%d].fop", i); - gf_proc_dump_write (key, "%s", gf_fop_list[stub->fop]); - - i++; - } - } - UNLOCK (&inode_ctx->lock); -out: - return ret; -} - -int -sp_priv_dump (xlator_t *this) -{ - sp_private_t *priv = NULL; - uint32_t total_entries = 0; - uint32_t ret = -1; - char key[GF_DUMP_MAX_BUF_LEN]; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - GF_VALIDATE_OR_GOTO (this->name, this->private, out); - - priv = this->private; - - total_entries = priv->entries; - - gf_proc_dump_build_key (key_prefix, "xlator.performance.stat-prefetch", - "priv"); - gf_proc_dump_add_section (key_prefix); - - gf_proc_dump_build_key (key, key_prefix, "max_allowed_entries"); - gf_proc_dump_write (key, "%lu", GF_SP_CACHE_ENTRIES_EXPECTED); - gf_proc_dump_build_key (key, key_prefix, "num_entries_cached"); - gf_proc_dump_write (key, "%lu",(unsigned long)total_entries); - ret = 0; - -out: - return ret; -} - - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - - ret = xlator_mem_acct_init (this, gf_sp_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - goto out; - } - -out: - return ret; -} - - -int32_t -init (xlator_t *this) -{ - int32_t ret = -1; - sp_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO ("stat-prefetch", this, out); - - if (!this->children || this->children->next) { - gf_log ("stat-prefetch", - GF_LOG_ERROR, - "FATAL: translator %s does not have exactly one child " - "node", this->name); - goto out; - } - - priv = GF_CALLOC (1, 
sizeof(sp_private_t), gf_sp_mt_sp_private_t); - LOCK_INIT (&priv->lock); - - this->private = priv; - - ret = 0; -out: - return ret; -} - -void -fini (xlator_t *this) -{ - sp_private_t *priv = NULL; - - if (!this) - goto out; - else { - priv = this->private; - if (priv) { - if (priv->mem_pool) - mem_pool_destroy (priv->mem_pool); - LOCK_DESTROY (&priv->lock); - GF_FREE (priv); - this->private = NULL; - } - } -out: - return; -} - - -struct xlator_fops fops = { - .lookup = sp_lookup, - .readdir = sp_readdir, - .readdirp = sp_readdir, - .open = sp_open, - .create = sp_create, - .opendir = sp_opendir, - .mkdir = sp_mkdir, - .mknod = sp_mknod, - .symlink = sp_symlink, - .link = sp_link, - .truncate = sp_truncate, - .ftruncate = sp_ftruncate, - .readlink = sp_readlink, - .unlink = sp_unlink, - .rmdir = sp_rmdir, - .readv = sp_readv, - .writev = sp_writev, - .fsync = sp_fsync, - .rename = sp_rename, - .setxattr = sp_setxattr, - .removexattr = sp_removexattr, - .xattrop = sp_xattrop, - .fxattrop = sp_fxattrop, - .setattr = sp_setattr, - .stat = sp_stat, - .access = sp_access, - .getxattr = sp_getxattr, - .inodelk = sp_inodelk, - .entrylk = sp_entrylk, -}; - -struct xlator_cbks cbks = { - .forget = sp_forget, - .release = sp_release, - .releasedir = sp_release -}; - -struct xlator_dumpops dumpops = { - .priv = sp_priv_dump, - .inodectx = sp_inodectx_dump, - .fdctx = sp_fdctx_dump -}; diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h deleted file mode 100644 index ed84719e407..00000000000 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - Copyright (c) 2009-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef _STAT_PREFETCH_H -#define _STAT_PREFETCH_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "locking.h" -#include "inode.h" -#include "glusterfs.h" -#include "dict.h" -#include "xlator.h" -#include "rbthash.h" -#include "hashfn.h" -#include "call-stub.h" -#include "stat-prefetch-mem-types.h" -#include <libgen.h> - -struct sp_cache { - rbthash_table_t *table; - xlator_t *this; - uint64_t expected_offset; /* Offset where the next read will - * happen. 
- */ - gf_lock_t lock; - unsigned long miss; - unsigned long hits; - uint32_t ref; -}; -typedef struct sp_cache sp_cache_t; - -struct sp_fd_ctx { - sp_cache_t *cache; - inode_t *parent_inode; /* - * inode corresponding to dirname (path) - */ - char *name; /* - * basename of path on which this fd is - * opened - */ -}; -typedef struct sp_fd_ctx sp_fd_ctx_t; - -struct sp_local { - loc_t loc; - fd_t *fd; - char is_lookup; -}; -typedef struct sp_local sp_local_t; - -struct sp_inode_ctx { - char looked_up; - char lookup_in_progress; - char need_unwind; - int32_t op_ret; - int32_t op_errno; - struct iatt stbuf; - gf_lock_t lock; - struct list_head waiting_ops; -}; -typedef struct sp_inode_ctx sp_inode_ctx_t; - -struct sp_private { - struct mem_pool *mem_pool; - uint32_t entries; - gf_lock_t lock; -}; -typedef struct sp_private sp_private_t; - -void sp_local_free (sp_local_t *local); - -#define SP_STACK_UNWIND(op, frame, params ...) do { \ - sp_local_t *__local = frame->local; \ - frame->local = NULL; \ - STACK_UNWIND_STRICT (op, frame, params); \ - sp_local_free (__local); \ - } while (0) - -#define SP_STACK_DESTROY(frame) do { \ - sp_local_t *__local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - sp_local_free (__local); \ - } while (0) - -#endif /* #ifndef _STAT_PREFETCH_H */ diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am deleted file mode 100644 index 06e85fc9216..00000000000 --- a/xlators/performance/symlink-cache/src/Makefile.am +++ /dev/null @@ -1,12 +0,0 @@ -xlator_LTLIBRARIES = symlink-cache.la -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/performance - -symlink_cache_la_LDFLAGS = -module -avoidversion - -symlink_cache_la_SOURCES = symlink-cache.c -symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared 
-nostartfiles $(GF_CFLAGS) - -CLEANFILES = diff --git a/xlators/performance/symlink-cache/src/symlink-cache.c b/xlators/performance/symlink-cache/src/symlink-cache.c deleted file mode 100644 index a82786cce0b..00000000000 --- a/xlators/performance/symlink-cache/src/symlink-cache.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" - -struct symlink_cache { - time_t ctime; - char *readlink; -}; - - -static int -symlink_inode_ctx_get (inode_t *inode, xlator_t *this, void **ctx) -{ - int ret = 0; - uint64_t tmp_ctx = 0; - ret = inode_ctx_get (inode, this, &tmp_ctx); - if (-1 == ret) - gf_log (this->name, GF_LOG_ERROR, "dict get failed"); - else - *ctx = (void *)(long)tmp_ctx; - - return 0; -} - - -static int -symlink_inode_ctx_set (inode_t *inode, xlator_t *this, void *ctx) -{ - int ret = 0; - ret = inode_ctx_put (inode, this, (uint64_t)(long) ctx); - if (-1 == ret) - gf_log (this->name, GF_LOG_ERROR, "dict set failed"); - - return 0; -} - - -int -sc_cache_update (xlator_t *this, inode_t *inode, const char *link) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) - return 0; - - if (!sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "updating cache: %s", link); - - sc->readlink = strdup (link); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "not updating existing cache: %s with %s", - sc->readlink, link); - } - - return 0; -} - - -int -sc_cache_set (xlator_t *this, inode_t *inode, struct iatt *buf, - const char *link) -{ - struct symlink_cache *sc = NULL; - int ret = -1; - int need_set = 0; - - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) { - need_set = 1; - sc = CALLOC (1, sizeof (*sc)); - if (!sc) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - goto err; - } - } - - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "replacing old cache: %s with new cache: %s", - sc->readlink, link); - FREE (sc->readlink); - sc->readlink = NULL; - } - - if (link) { - sc->readlink = strdup (link); - if (!sc->readlink) { - gf_log (this->name, GF_LOG_ERROR, - "out of 
memory :("); - goto err; - } - } - - sc->ctime = buf->ia_ctime; - - gf_log (this->name, GF_LOG_DEBUG, - "setting symlink cache: %s", link); - - if (need_set) { - ret = symlink_inode_ctx_set (inode, this, sc); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set inode context (%s)", - strerror (-ret)); - goto err; - } - } - - return 0; -err: - - if (sc) { - if (sc->readlink) - FREE (sc->readlink); - sc->readlink = NULL; - FREE (sc); - } - - return -1; -} - - -int -sc_cache_flush (xlator_t *this, inode_t *inode) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - if (!sc) - return 0; - - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "flushing cache: %s", sc->readlink); - - FREE (sc->readlink); - sc->readlink = NULL; - } - - FREE (sc); - - return 0; -} - - -int -sc_cache_validate (xlator_t *this, inode_t *inode, struct iatt *buf) -{ - struct symlink_cache *sc = NULL; - uint64_t tmp_sc = 0; - - if (!IA_ISLNK (buf->ia_type)) { - sc_cache_flush (this, inode); - return 0; - } - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - - if (!sc) { - sc_cache_set (this, inode, buf, NULL); - inode_ctx_get (inode, this, &tmp_sc); - - if (!sc) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory :("); - return 0; - } - sc = (struct symlink_cache *)(long)tmp_sc; - } - - if (sc->ctime == buf->ia_ctime) - return 0; - - /* STALE */ - if (sc->readlink) { - gf_log (this->name, GF_LOG_DEBUG, - "flushing cache: %s", sc->readlink); - - FREE (sc->readlink); - sc->readlink = NULL; - } - - sc->ctime = buf->ia_ctime; - - return 0; -} - - - -int -sc_cache_get (xlator_t *this, inode_t *inode, char **link) -{ - struct symlink_cache *sc = NULL; - - symlink_inode_ctx_get (inode, this, VOID(&sc)); - - if (!sc) - return 0; - - if (link && sc->readlink) - *link = strdup (sc->readlink); - return 0; -} - - -int -sc_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - const char *link, struct 
iatt *sbuf) -{ - if (op_ret > 0) - sc_cache_update (this, frame->local, link); - - inode_unref (frame->local); - frame->local = NULL; - - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, link, sbuf); - return 0; -} - - -int -sc_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) -{ - char *link = NULL; - struct iatt buf = {0, }; - - sc_cache_get (this, loc->inode, &link); - - if (link) { - /* cache hit */ - gf_log (this->name, GF_LOG_DEBUG, - "cache hit %s -> %s", - loc->path, link); - - /* - libglusterfsclient, nfs or any other translators - using buf in readlink_cbk should be aware that @buf - is 0 filled - */ - STACK_UNWIND_STRICT (readlink, frame, strlen (link), 0, link, &buf); - FREE (link); - return 0; - } - - frame->local = inode_ref (loc->inode); - - STACK_WIND (frame, sc_readlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readlink, - loc, size); - - return 0; -} - - -int -sc_symlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - if (op_ret == 0) { - if (frame->local) { - sc_cache_set (this, inode, buf, frame->local); - } - } - - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, preparent, - postparent); - return 0; -} - - -int -sc_symlink (call_frame_t *frame, xlator_t *this, - const char *dst, loc_t *src, dict_t *params) -{ - frame->local = strdup (dst); - - STACK_WIND (frame, sc_symlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->symlink, - dst, src, params); - - return 0; -} - - -int -sc_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - if (op_ret == 0) - sc_cache_validate (this, inode, buf); - else - sc_cache_flush (this, inode); - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xattr, postparent); - return 0; -} - - 
-int -sc_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - STACK_WIND (frame, sc_lookup_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lookup, - loc, xattr_req); - - return 0; -} - - -int -sc_forget (xlator_t *this, - inode_t *inode) -{ - sc_cache_flush (this, inode); - - return 0; -} - - -int32_t -init (xlator_t *this) -{ - - if (!this->children || this->children->next) - { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: volume (%s) not configured with exactly one " - "child", this->name); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - return 0; -} - - -void -fini (xlator_t *this) -{ - return; -} - - -struct xlator_fops fops = { - .lookup = sc_lookup, - .symlink = sc_symlink, - .readlink = sc_readlink, -}; - - -struct xlator_cbks cbks = { - .forget = sc_forget, -}; - -struct volume_options options[] = { - { .key = {NULL} }, -}; diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am index a5ebc90bdca..a6a16fcc080 100644 --- a/xlators/performance/write-behind/src/Makefile.am +++ b/xlators/performance/write-behind/src/Makefile.am @@ -1,14 +1,16 @@ xlator_LTLIBRARIES = write-behind.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -write_behind_la_LDFLAGS = -module -avoidversion +write_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) write_behind_la_SOURCES = write-behind.c write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = write-behind-mem-types.h +noinst_HEADERS = write-behind-mem-types.h write-behind-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) 
CLEANFILES = diff --git a/xlators/performance/write-behind/src/write-behind-mem-types.h b/xlators/performance/write-behind/src/write-behind-mem-types.h index 5a3ee4aed0f..a0647299150 100644 --- a/xlators/performance/write-behind/src/write-behind-mem-types.h +++ b/xlators/performance/write-behind/src/write-behind-mem-types.h @@ -1,35 +1,24 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - #ifndef __WB_MEM_TYPES_H__ #define __WB_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_wb_mem_types_ { - gf_wb_mt_wb_file_t = gf_common_mt_end + 1, - gf_wb_mt_wb_local_t, - gf_wb_mt_wb_request_t, - gf_wb_mt_iovec, - gf_wb_mt_wb_conf_t, - gf_wb_mt_end + gf_wb_mt_wb_file_t = gf_common_mt_end + 1, + gf_wb_mt_wb_request_t, + gf_wb_mt_iovec, + gf_wb_mt_wb_conf_t, + gf_wb_mt_wb_inode_t, + gf_wb_mt_end }; #endif - diff --git a/xlators/performance/write-behind/src/write-behind-messages.h b/xlators/performance/write-behind/src/write-behind-messages.h new file mode 100644 index 00000000000..e9ea474879b --- /dev/null +++ b/xlators/performance/write-behind/src/write-behind-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _WRITE_BEHIND_MESSAGES_H_ +#define _WRITE_BEHIND_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. 
+ */ + +GLFS_MSGID(WRITE_BEHIND, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE, + WRITE_BEHIND_MSG_INIT_FAILED, WRITE_BEHIND_MSG_INVALID_ARGUMENT, + WRITE_BEHIND_MSG_NO_MEMORY, WRITE_BEHIND_MSG_SIZE_NOT_SET, + WRITE_BEHIND_MSG_VOL_MISCONFIGURED, + WRITE_BEHIND_MSG_RES_UNAVAILABLE); + +#endif /* _WRITE_BEHIND_MESSAGES_H_ */ diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 52e03872026..00cfca016e6 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -1,3021 +1,3278 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ -/*TODO: check for non null wb_file_data before getting wb_file */ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "list.h" -#include "compat.h" -#include "compat-errno.h" -#include "common-utils.h" -#include "call-stub.h" -#include "statedump.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/list.h> +#include <glusterfs/compat.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/statedump.h> +#include <glusterfs/defaults.h> #include "write-behind-mem-types.h" +#include "write-behind-messages.h" -#define MAX_VECTOR_COUNT 8 +#define MAX_VECTOR_COUNT 8 #define WB_AGGREGATE_SIZE 131072 /* 128 KB */ -#define WB_WINDOW_SIZE 1048576 /* 1MB */ +#define WB_WINDOW_SIZE 1048576 /* 1MB */ typedef struct list_head list_head_t; struct wb_conf; -struct wb_page; -struct wb_file; - -typedef struct wb_file { - int disabled; - uint64_t disable_till; - size_t window_conf; - size_t window_current; - int32_t flags; - size_t aggregate_current; - int32_t refcount; - int32_t op_ret; - int32_t op_errno; - list_head_t request; - list_head_t passive_requests; - fd_t *fd; - gf_lock_t lock; - xlator_t *this; -}wb_file_t; +struct wb_inode; + +typedef struct wb_inode { + ssize_t window_conf; + ssize_t window_current; + ssize_t transit; /* size of data stack_wound, and yet + to be fulfilled (wb_fulfill_cbk). + used for trickling_writes + */ + + list_head_t all; /* All requests, from enqueue() till destroy(). + Used only for resetting generation + number when empty. + */ + list_head_t todo; /* Work to do (i.e, STACK_WIND to server). + Once we STACK_WIND, the entry is taken + off the list. 
If it is non-sync write, + then we continue to track it via @liability + or @temptation depending on the status + of its writeback. + */ + list_head_t liability; /* Non-sync writes which are lied + (STACK_UNWIND'ed to caller) but ack + from server not yet complete. This + is the "liability" which we hold, and + must guarantee that dependent operations + which arrive later (which overlap, etc.) + are issued only after their dependencies + in this list are "fulfilled". + + Server acks for entries in this list + shrinks the window. + + The sum total of all req->write_size + of entries in this list must be kept less + than the permitted window size. + */ + list_head_t temptation; /* Operations for which we are tempted + to 'lie' (write-behind), but temporarily + holding off (because of insufficient + window capacity, etc.) + + This is the list to look at to grow + the window (in __wb_pick_unwinds()). + + Entries typically get chosen from + write-behind from this list, and therefore + get "upgraded" to the "liability" list. + */ + list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC + which are currently STACK_WIND'ed towards the server. + This is for guaranteeing that no two overlapping + writes are in progress at the same time. Modules + like eager-lock in AFR depend on this behavior. + */ + list_head_t invalidate_list; /* list of wb_inodes that were marked for + * iatt invalidation due to requests in + * liability queue fulfilled while there + * was a readdirp session on parent + * directory. For a directory inode, this + * list points to list of children. + */ + uint64_t gen; /* Liability generation number. Represents + the current 'state' of liability. Every + new addition to the liability list bumps + the generation number. + + a newly arrived request is only required + to perform causal checks against the entries + in the liability list which were present + at the time of its addition. 
the generation + number at the time of its addition is stored + in the request and used during checks. + + the liability list can grow while the request + waits in the todo list waiting for its + dependent operations to complete. however + it is not of the request's concern to depend + itself on those new entries which arrived + after it arrived (i.e, those that have a + liability generation higher than itself) + */ + size_t size; /* Size of the file to catch write after EOF. */ + gf_lock_t lock; + xlator_t *this; + inode_t *inode; + int dontsync; /* If positive, don't pick lies for + * winding. This is needed to break infinite + * recursion during invocation of + * wb_process_queue from + * wb_fulfill_cbk in case of an + * error during fulfill. + */ + gf_atomic_int32_t readdirps; + gf_atomic_int8_t invalidate; + +} wb_inode_t; typedef struct wb_request { - list_head_t list; - list_head_t winds; - list_head_t unwinds; - list_head_t other_requests; - call_stub_t *stub; - size_t write_size; - int32_t refcount; - wb_file_t *file; - glusterfs_fop_t fop; - union { - struct { - char write_behind; - char stack_wound; - char got_reply; - char virgin; - char flush_all; /* while trying to sync to back-end, - * don't wait till a data of size - * equal to configured aggregate-size - * is accumulated, instead sync - * whatever data currently present in - * request queue. - */ - - }write_request; - - struct { - char marked_for_resume; - }other_requests; - }flags; + list_head_t all; + list_head_t todo; + list_head_t lie; /* either in @liability or @temptation */ + list_head_t winds; + list_head_t unwinds; + list_head_t wip; + + call_stub_t *stub; + + ssize_t write_size; /* currently held size + (after collapsing) */ + size_t orig_size; /* size which arrived with the request. + This is the size by which we grow + the window when unwinding the frame. + */ + size_t total_size; /* valid only in @head in wb_fulfill(). 
+ This is the size with which we perform + STACK_WIND to server and therefore the + amount by which we shrink the window. + */ + + int op_ret; + int op_errno; + + int32_t refcount; + wb_inode_t *wb_inode; + glusterfs_fop_t fop; + gf_lkowner_t lk_owner; + pid_t client_pid; + struct iobref *iobref; + uint64_t gen; /* inode liability state at the time of + request arrival */ + + fd_t *fd; + int wind_count; /* number of sync-attempts. Only + for debug purposes */ + struct { + size_t size; /* 0 size == till infinity */ + off_t off; + int append : 1; /* offset is invalid. only one + outstanding append at a time */ + int tempted : 1; /* true only for non-sync writes */ + int lied : 1; /* sin committed */ + int fulfilled : 1; /* got server acknowledgement */ + int go : 1; /* enough aggregating, good to go */ + } ordering; + + /* for debug purposes. A request might outlive the fop it is + * representing. So, preserve essential info for logging. + */ + uint64_t unique; + uuid_t gfid; } wb_request_t; -struct wb_conf { - uint64_t aggregate_size; - uint64_t window_size; - uint64_t disable_till; - gf_boolean_t enable_O_SYNC; - gf_boolean_t flush_behind; - gf_boolean_t enable_trickling_writes; -}; +typedef struct wb_conf { + uint64_t aggregate_size; + uint64_t page_size; + uint64_t window_size; + gf_boolean_t flush_behind; + gf_boolean_t trickling_writes; + gf_boolean_t strict_write_ordering; + gf_boolean_t strict_O_DIRECT; + gf_boolean_t resync_after_fsync; +} wb_conf_t; + +wb_inode_t * +__wb_inode_ctx_get(xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + wb_inode_t *wb_inode = NULL; + int ret = 0; -typedef struct wb_local { - list_head_t winds; - int32_t flags; - int32_t wbflags; - struct wb_file *file; - wb_request_t *request; - int op_ret; - int op_errno; - call_frame_t *frame; - int32_t reply_count; -} wb_local_t; - -typedef struct wb_conf wb_conf_t; -typedef struct wb_page wb_page_t; + ret = __inode_ctx_get(inode, this, &value); + if (ret) + return NULL; -int32_t 
-wb_process_queue (call_frame_t *frame, wb_file_t *file); + wb_inode = (wb_inode_t *)(unsigned long)value; -ssize_t -wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds); + return wb_inode; +} -ssize_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size, - char enable_trickling_writes); +wb_inode_t * +wb_inode_ctx_get(xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; + GF_VALIDATE_OR_GOTO("write-behind", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); -static int -__wb_request_unref (wb_request_t *this) + LOCK(&inode->lock); + { + wb_inode = __wb_inode_ctx_get(this, inode); + } + UNLOCK(&inode->lock); +out: + return wb_inode; +} + +static void +wb_set_invalidate(wb_inode_t *wb_inode) { - int ret = -1; + int readdirps = 0; + inode_t *parent_inode = NULL; + wb_inode_t *wb_parent_inode = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + parent_inode = inode_parent(wb_inode->inode, NULL, NULL); + if (parent_inode) + wb_parent_inode = wb_inode_ctx_get(wb_inode->this, parent_inode); - if (this->refcount <= 0) { - gf_log ("wb-request", GF_LOG_WARNING, - "refcount(%d) is <= 0", this->refcount); - goto out; - } + if (wb_parent_inode) { + LOCK(&wb_parent_inode->lock); + { + readdirps = GF_ATOMIC_GET(wb_parent_inode->readdirps); + if (readdirps && list_empty(&wb_inode->invalidate_list)) { + inode_ref(wb_inode->inode); + GF_ATOMIC_INIT(wb_inode->invalidate, 1); + list_add(&wb_inode->invalidate_list, + &wb_parent_inode->invalidate_list); + } + } + UNLOCK(&wb_parent_inode->lock); + } else { + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + } + + if (parent_inode) + inode_unref(parent_inode); + + return; +} - ret = --this->refcount; - if (this->refcount == 0) { - list_del_init (&this->list); - if (this->stub && this->stub->fop == GF_FOP_WRITE) { - call_stub_destroy (this->stub); - } +void +wb_process_queue(wb_inode_t *wb_inode); - GF_FREE (this); - } +/* + Below is a succinct explanation of the code deciding 
whether two regions + overlap, from Pavan <tcp@gluster.com>. -out: - return ret; -} + For any two ranges to be non-overlapping, either the end of the first + range is lesser than the start of the second, or vice versa. Example - + <---------> <--------------> + p q x y -static int -wb_request_unref (wb_request_t *this) -{ - wb_file_t *file = NULL; - int ret = -1; + ( q < x ) or (y < p) = > No overlap. - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + To check for *overlap*, we can negate this (using de morgan's laws), and + it becomes - - file = this->file; + (q >= x ) and (y >= p) - LOCK (&file->lock); - { - ret = __wb_request_unref (this); - } - UNLOCK (&file->lock); + Either that, or you write the negation using - -out: - return ret; -} + if (! ((q < x) or (y < p)) ) { + "Overlap" + } +*/ +gf_boolean_t +wb_requests_overlap(wb_request_t *req1, wb_request_t *req2) +{ + uint64_t r1_start = 0; + uint64_t r1_end = 0; + uint64_t r2_start = 0; + uint64_t r2_end = 0; + gf_boolean_t do_overlap = _gf_false; + + r1_start = req1->ordering.off; + if (req1->ordering.size) + r1_end = r1_start + req1->ordering.size - 1; + else + r1_end = ULLONG_MAX; + + r2_start = req2->ordering.off; + if (req2->ordering.size) + r2_end = r2_start + req2->ordering.size - 1; + else + r2_end = ULLONG_MAX; + + do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start)); + + return do_overlap; +} -static wb_request_t * -__wb_request_ref (wb_request_t *this) +gf_boolean_t +wb_requests_conflict(wb_request_t *lie, wb_request_t *req) { - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + wb_conf_t *conf = NULL; - if (this->refcount < 0) { - gf_log ("wb-request", GF_LOG_WARNING, - "refcount(%d) is < 0", this->refcount); - this = NULL; - goto out; - } + conf = req->wb_inode->this->private; - this->refcount++; + if (lie == req) + /* request cannot conflict with itself */ + return _gf_false; -out: - return this; + if (lie->gen >= req->gen) + /* this liability entry was behind + us in the todo list */ + 
return _gf_false; + + if (lie->ordering.append) + /* all modifications wait for the completion + of outstanding append */ + return _gf_true; + + if (conf->strict_write_ordering) + /* We are sure (lie->gen < req->gen) by now. So + skip overlap check if strict write ordering is + requested and always return "conflict" against a + lower generation lie. */ + return _gf_true; + + return wb_requests_overlap(lie, req); } +wb_request_t * +wb_liability_has_conflict(wb_inode_t *wb_inode, wb_request_t *req) +{ + wb_request_t *each = NULL; + + list_for_each_entry(each, &wb_inode->liability, lie) + { + if (wb_requests_conflict(each, req) && (!each->ordering.fulfilled)) + /* A fulfilled request shouldn't block another + * request (even a dependent one) from winding. + */ + return each; + } + + return NULL; +} wb_request_t * -wb_request_ref (wb_request_t *this) +wb_wip_has_conflict(wb_inode_t *wb_inode, wb_request_t *req) { - wb_file_t *file = NULL; + wb_request_t *each = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + if (req->stub->fop != GF_FOP_WRITE) + /* non-writes fundamentally never conflict with WIP requests */ + return NULL; - file = this->file; - LOCK (&file->lock); - { - this = __wb_request_ref (this); - } - UNLOCK (&file->lock); + list_for_each_entry(each, &wb_inode->wip, wip) + { + if (each == req) + /* request never conflicts with itself, + though this condition should never occur. 
+ */ + continue; -out: - return this; -} + if (wb_requests_overlap(each, req)) + return each; + } + return NULL; +} -wb_request_t * -wb_enqueue (wb_file_t *file, call_stub_t *stub) +static int +__wb_request_unref(wb_request_t *req) { - wb_request_t *request = NULL, *tmp = NULL; - call_frame_t *frame = NULL; - wb_local_t *local = NULL; - struct iovec *vector = NULL; - int32_t count = 0; - - GF_VALIDATE_OR_GOTO ("write-behind", file, out); - GF_VALIDATE_OR_GOTO (file->this->name, stub, out); + int ret = -1; + wb_inode_t *wb_inode = NULL; + char gfid[64] = { + 0, + }; - request = GF_CALLOC (1, sizeof (*request), gf_wb_mt_wb_request_t); - if (request == NULL) { - goto out; - } + wb_inode = req->wb_inode; - INIT_LIST_HEAD (&request->list); - INIT_LIST_HEAD (&request->winds); - INIT_LIST_HEAD (&request->unwinds); - INIT_LIST_HEAD (&request->other_requests); + if (req->refcount <= 0) { + uuid_utoa_r(req->gfid, gfid); - request->stub = stub; - request->file = file; - request->fop = stub->fop; + gf_msg( + "wb-request", GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_RES_UNAVAILABLE, + "(unique=%" PRIu64 ", fop=%s, gfid=%s, gen=%" PRIu64 + "): " + "refcount(%d) is <= 0 ", + req->unique, gf_fop_list[req->fop], gfid, req->gen, req->refcount); + goto out; + } - frame = stub->frame; - local = frame->local; - if (local) { - local->request = request; - } + ret = --req->refcount; + if (req->refcount == 0) { + uuid_utoa_r(req->gfid, gfid); - if (stub->fop == GF_FOP_WRITE) { - vector = stub->args.writev.vector; - count = stub->args.writev.count; + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "(unique = %" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): destroying request, " + "removing from all queues", + req->unique, gf_fop_list[req->fop], gfid, req->gen); - request->write_size = iov_length (vector, count); - if (local) { - local->op_ret = request->write_size; - local->op_errno = 0; - } + list_del_init(&req->todo); + list_del_init(&req->lie); + list_del_init(&req->wip); - 
request->flags.write_request.virgin = 1; + list_del_init(&req->all); + if (list_empty(&wb_inode->all)) { + wb_inode->gen = 0; + /* in case of accounting errors? */ + wb_inode->window_current = 0; } - LOCK (&file->lock); - { - list_add_tail (&request->list, &file->request); - if (stub->fop == GF_FOP_WRITE) { - /* reference for stack winding */ - __wb_request_ref (request); - - /* reference for stack unwinding */ - __wb_request_ref (request); + list_del_init(&req->winds); + list_del_init(&req->unwinds); - file->aggregate_current += request->write_size; - } else { - list_for_each_entry (tmp, &file->request, list) { - if (tmp->stub && tmp->stub->fop - == GF_FOP_WRITE) { - tmp->flags.write_request.flush_all = 1; - } - } - - /*reference for resuming */ - __wb_request_ref (request); - } + if (req->stub) { + call_stub_destroy(req->stub); + req->stub = NULL; } - UNLOCK (&file->lock); + if (req->iobref) + iobref_unref(req->iobref); + + if (req->fd) + fd_unref(req->fd); + + GF_FREE(req); + } out: - return request; + return ret; } - -wb_file_t * -wb_file_create (xlator_t *this, fd_t *fd, int32_t flags) +static int +wb_request_unref(wb_request_t *req) { - wb_file_t *file = NULL; - wb_conf_t *conf = NULL; + wb_inode_t *wb_inode = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO("write-behind", req, out); - conf = this->private; + wb_inode = req->wb_inode; - file = GF_CALLOC (1, sizeof (*file), gf_wb_mt_wb_file_t); - if (file == NULL) { - goto out; - } + LOCK(&wb_inode->lock); + { + ret = __wb_request_unref(req); + } + UNLOCK(&wb_inode->lock); - INIT_LIST_HEAD (&file->request); - INIT_LIST_HEAD (&file->passive_requests); +out: + return ret; +} - /* - fd_ref() not required, file should never decide the existence of - an fd - */ - file->fd= fd; - file->disable_till = conf->disable_till; - file->this = this; - file->refcount = 1; - file->window_conf = conf->window_size; - file->flags = flags; 
+static wb_request_t * +__wb_request_ref(wb_request_t *req) +{ + GF_VALIDATE_OR_GOTO("write-behind", req, out); - LOCK_INIT (&file->lock); + if (req->refcount < 0) { + gf_msg("wb-request", GF_LOG_WARNING, 0, + WRITE_BEHIND_MSG_RES_UNAVAILABLE, "refcount(%d) is < 0", + req->refcount); + req = NULL; + goto out; + } - fd_ctx_set (fd, this, (uint64_t)(long)file); + req->refcount++; out: - return file; + return req; } - -void -wb_file_destroy (wb_file_t *file) +wb_request_t * +wb_request_ref(wb_request_t *req) { - int32_t refcount = 0; - - GF_VALIDATE_OR_GOTO ("write-behind", file, out); + wb_inode_t *wb_inode = NULL; - LOCK (&file->lock); - { - refcount = --file->refcount; - } - UNLOCK (&file->lock); + GF_VALIDATE_OR_GOTO("write-behind", req, out); - if (!refcount){ - LOCK_DESTROY (&file->lock); - GF_FREE (file); - } + wb_inode = req->wb_inode; + LOCK(&wb_inode->lock); + { + req = __wb_request_ref(req); + } + UNLOCK(&wb_inode->lock); out: - return; + return req; } - -int32_t -wb_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf) +gf_boolean_t +wb_enqueue_common(wb_inode_t *wb_inode, call_stub_t *stub, int tempted) { - wb_local_t *local = NULL; - list_head_t *winds = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL, *dummy = NULL; - wb_local_t *per_request_local = NULL; - int32_t ret = -1; - fd_t *fd = NULL; + wb_request_t *req = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out); + GF_VALIDATE_OR_GOTO(wb_inode->this->name, stub, out); + + req = GF_CALLOC(1, sizeof(*req), gf_wb_mt_wb_request_t); + if (!req) + goto out; + + INIT_LIST_HEAD(&req->all); + INIT_LIST_HEAD(&req->todo); + INIT_LIST_HEAD(&req->lie); + INIT_LIST_HEAD(&req->winds); + INIT_LIST_HEAD(&req->unwinds); + INIT_LIST_HEAD(&req->wip); + + req->stub = stub; + req->wb_inode = wb_inode; + req->fop = stub->fop; + req->ordering.tempted = tempted; + req->unique = 
stub->frame->root->unique; + + inode = ((stub->args.fd != NULL) ? stub->args.fd->inode + : stub->args.loc.inode); + + if (inode) + gf_uuid_copy(req->gfid, inode->gfid); + + if (stub->fop == GF_FOP_WRITE) { + req->write_size = iov_length(stub->args.vector, stub->args.count); + + /* req->write_size can change as we collapse + small writes. But the window needs to grow + only by how much we acknowledge the app. so + copy the original size in orig_size for the + purpose of accounting. + */ + req->orig_size = req->write_size; - GF_ASSERT (frame); - GF_ASSERT (this); + /* Let's be optimistic that we can + lie about it + */ + req->op_ret = req->write_size; + req->op_errno = 0; + + if (stub->args.fd && (stub->args.fd->flags & O_APPEND)) + req->ordering.append = 1; + } + + req->lk_owner = stub->frame->root->lk_owner; + req->client_pid = stub->frame->root->pid; + + switch (stub->fop) { + case GF_FOP_WRITE: + LOCK(&wb_inode->lock); + { + if (wb_inode->size < stub->args.offset) { + req->ordering.off = wb_inode->size; + req->ordering.size = stub->args.offset + req->write_size - + wb_inode->size; + } else { + req->ordering.off = stub->args.offset; + req->ordering.size = req->write_size; + } - local = frame->local; - winds = &local->winds; + if (wb_inode->size < stub->args.offset + req->write_size) + wb_inode->size = stub->args.offset + req->write_size; + } + UNLOCK(&wb_inode->lock); - file = local->file; - GF_VALIDATE_OR_GOTO (this->name, file, out); + req->fd = fd_ref(stub->args.fd); - LOCK (&file->lock); - { - list_for_each_entry_safe (request, dummy, winds, winds) { - request->flags.write_request.got_reply = 1; + break; + case GF_FOP_READ: + req->ordering.off = stub->args.offset; + req->ordering.size = stub->args.size; - if (!request->flags.write_request.write_behind - && (op_ret == -1)) { - per_request_local = request->stub->frame->local; - per_request_local->op_ret = op_ret; - per_request_local->op_errno = op_errno; - } + req->fd = fd_ref(stub->args.fd); - if 
(request->flags.write_request.write_behind) { - file->window_current -= request->write_size; - } + break; + case GF_FOP_TRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + LOCK(&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK(&wb_inode->lock); + break; + case GF_FOP_FTRUNCATE: + req->ordering.off = stub->args.offset; + req->ordering.size = 0; /* till infinity */ + LOCK(&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK(&wb_inode->lock); - __wb_request_unref (request); - } + req->fd = fd_ref(stub->args.fd); - if (op_ret == -1) { - file->op_ret = op_ret; - file->op_errno = op_errno; - } - fd = file->fd; - } - UNLOCK (&file->lock); - - ret = wb_process_queue (frame, file); - if (ret == -1) { - if (errno == ENOMEM) { - LOCK (&file->lock); - { - file->op_ret = -1; - file->op_errno = ENOMEM; - } - UNLOCK (&file->lock); - } + break; + default: + if (stub && stub->args.fd) + req->fd = fd_ref(stub->args.fd); - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + break; + } - /* safe place to do fd_unref */ - fd_unref (fd); + LOCK(&wb_inode->lock); + { + list_add_tail(&req->all, &wb_inode->all); - STACK_DESTROY (frame->root); + req->gen = wb_inode->gen; -out: - return 0; -} - - -ssize_t -wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds) -{ - wb_request_t *dummy = NULL, *request = NULL; - wb_request_t *first_request = NULL, *next = NULL; - size_t total_count = 0, count = 0; - size_t copied = 0; - call_frame_t *sync_frame = NULL; - struct iobref *iobref = NULL; - wb_local_t *local = NULL; - struct iovec *vector = NULL; - ssize_t current_size = 0, bytes = 0; - size_t bytecount = 0; - wb_conf_t *conf = NULL; - fd_t *fd = NULL; - int32_t op_errno = -1; - - GF_VALIDATE_OR_GOTO_WITH_ERROR ((file ? 
file->this->name - : "write-behind"), frame, - out, bytes, -1); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, file, out, bytes, - -1); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, winds, out, bytes, - -1); - - conf = file->this->private; - list_for_each_entry (request, winds, winds) { - total_count += request->stub->args.writev.count; - if (total_count > 0) { - break; - } - } + list_add_tail(&req->todo, &wb_inode->todo); + __wb_request_ref(req); /* for wind */ - if (total_count == 0) { - gf_log (file->this->name, GF_LOG_TRACE, "no vectors are to be" - "synced"); - goto out; + if (req->ordering.tempted) { + list_add_tail(&req->lie, &wb_inode->temptation); + __wb_request_ref(req); /* for unwind */ } + } + UNLOCK(&wb_inode->lock); - list_for_each_entry_safe (request, dummy, winds, winds) { - if (!vector) { - vector = GF_MALLOC (VECTORSIZE (MAX_VECTOR_COUNT), - gf_wb_mt_iovec); - if (vector == NULL) { - bytes = -1; - op_errno = ENOMEM; - goto out; - } - - iobref = iobref_new (); - if (iobref == NULL) { - bytes = -1; - op_errno = ENOMEM; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), - gf_wb_mt_wb_local_t); - if (local == NULL) { - bytes = -1; - op_errno = ENOMEM; - goto out; - } - - INIT_LIST_HEAD (&local->winds); - - first_request = request; - current_size = 0; - } +out: + if (!req) + return _gf_false; - count += request->stub->args.writev.count; - bytecount = VECTORSIZE (request->stub->args.writev.count); - memcpy (((char *)vector)+copied, - request->stub->args.writev.vector, - bytecount); - copied += bytecount; + return _gf_true; +} - current_size += request->write_size; +gf_boolean_t +wb_enqueue(wb_inode_t *wb_inode, call_stub_t *stub) +{ + return wb_enqueue_common(wb_inode, stub, 0); +} - if (request->stub->args.writev.iobref) { - iobref_merge (iobref, - request->stub->args.writev.iobref); - } +gf_boolean_t +wb_enqueue_tempted(wb_inode_t *wb_inode, call_stub_t *stub) +{ + return wb_enqueue_common(wb_inode, stub, 1); +} - next = NULL; - 
if (request->winds.next != winds) { - next = list_entry (request->winds.next, - wb_request_t, winds); - } +wb_inode_t * +__wb_inode_create(xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + int ret = 0; - list_del_init (&request->winds); - list_add_tail (&request->winds, &local->winds); - - if ((!next) - || ((count + next->stub->args.writev.count) - > MAX_VECTOR_COUNT) - || ((current_size + next->write_size) - > conf->aggregate_size)) { - - sync_frame = copy_frame (frame); - if (sync_frame == NULL) { - bytes = -1; - op_errno = ENOMEM; - goto out; - } - - sync_frame->local = local; - local->file = file; - - LOCK (&file->lock); - { - fd = file->fd; - } - UNLOCK (&file->lock); - - fd_ref (fd); - - bytes += current_size; - STACK_WIND (sync_frame, wb_sync_cbk, - FIRST_CHILD(sync_frame->this), - FIRST_CHILD(sync_frame->this)->fops->writev, - fd, vector, count, - first_request->stub->args.writev.off, - iobref); - - iobref_unref (iobref); - GF_FREE (vector); - first_request = NULL; - iobref = NULL; - vector = NULL; - sync_frame = NULL; - local = NULL; - copied = count = 0; - } - } + GF_VALIDATE_OR_GOTO(this->name, inode, out); -out: - if (sync_frame != NULL) { - sync_frame->local = NULL; - STACK_DESTROY (sync_frame->root); - } + conf = this->private; - if (local != NULL) { - /* had we winded these requests, we would have unrefed - * in wb_sync_cbk. 
- */ - list_for_each_entry_safe (request, dummy, &local->winds, - winds) { - wb_request_unref (request); - } + wb_inode = GF_CALLOC(1, sizeof(*wb_inode), gf_wb_mt_wb_inode_t); + if (!wb_inode) + goto out; - GF_FREE (local); - local = NULL; - } + INIT_LIST_HEAD(&wb_inode->all); + INIT_LIST_HEAD(&wb_inode->todo); + INIT_LIST_HEAD(&wb_inode->liability); + INIT_LIST_HEAD(&wb_inode->temptation); + INIT_LIST_HEAD(&wb_inode->wip); + INIT_LIST_HEAD(&wb_inode->invalidate_list); - if (iobref != NULL) { - iobref_unref (iobref); - } + wb_inode->this = this; - if (vector != NULL) { - GF_FREE (vector); - } + wb_inode->window_conf = conf->window_size; + wb_inode->inode = inode; - if (bytes == -1) { - /* - * had we winded these requests, we would have unrefed - * in wb_sync_cbk. - */ - if (local) { - list_for_each_entry_safe (request, dummy, &local->winds, - winds) { - wb_request_unref (request); - } - } + LOCK_INIT(&wb_inode->lock); + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + GF_ATOMIC_INIT(wb_inode->readdirps, 0); - if (file != NULL) { - LOCK (&file->lock); - { - file->op_ret = -1; - file->op_errno = op_errno; - } - UNLOCK (&file->lock); - } - } + ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)wb_inode); + if (ret) { + GF_FREE(wb_inode); + wb_inode = NULL; + } - return bytes; +out: + return wb_inode; } +wb_inode_t * +wb_inode_create(xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; -int32_t -wb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *buf) -{ - wb_local_t *local = NULL; - wb_request_t *request = NULL; - call_frame_t *process_frame = NULL; - wb_file_t *file = NULL; - int32_t ret = -1; - fd_t *fd = NULL; - - GF_ASSERT (frame); - GF_ASSERT (this); - - local = frame->local; - file = local->file; - - request = local->request; - if (request) { - process_frame = copy_frame (frame); - if (process_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - } - } - - STACK_UNWIND_STRICT (stat, 
frame, op_ret, op_errno, buf); - - if (request != NULL) { - wb_request_unref (request); - } + GF_VALIDATE_OR_GOTO(this->name, inode, out); - if (process_frame != NULL) { - ret = wb_process_queue (process_frame, file); - if (ret == -1) { - if ((errno == ENOMEM) && (file != NULL)) { - LOCK (&file->lock); - { - file->op_ret = -1; - file->op_errno = ENOMEM; - } - UNLOCK (&file->lock); - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + LOCK(&inode->lock); + { + wb_inode = __wb_inode_ctx_get(this, inode); + if (!wb_inode) + wb_inode = __wb_inode_create(this, inode); + } + UNLOCK(&inode->lock); - STACK_DESTROY (process_frame->root); - } +out: + return wb_inode; +} - if (file) { - LOCK (&file->lock); - { - fd = file->fd; - } - UNLOCK (&file->lock); +void +wb_inode_destroy(wb_inode_t *wb_inode) +{ + GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out); - fd_unref (fd); - } + GF_ASSERT(list_empty(&wb_inode->todo)); + GF_ASSERT(list_empty(&wb_inode->liability)); + GF_ASSERT(list_empty(&wb_inode->temptation)); - return 0; + LOCK_DESTROY(&wb_inode->lock); + GF_FREE(wb_inode); +out: + return; } - -static int32_t -wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc) +void +__wb_fulfill_request(wb_request_t *req) { - GF_ASSERT (frame); - GF_ASSERT (this); + wb_inode_t *wb_inode = NULL; + char gfid[64] = { + 0, + }; + + wb_inode = req->wb_inode; + + req->ordering.fulfilled = 1; + wb_inode->window_current -= req->total_size; + wb_inode->transit -= req->total_size; + + uuid_utoa_r(req->gfid, gfid); + + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): request fulfilled. " + "removing the request from liability queue? = %s", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + req->ordering.lied ? "yes" : "no"); + + if (req->ordering.lied) { + /* 1. If yes, request is in liability queue and hence can be + safely removed from list. + 2. 
If no, request is in temptation queue and hence should be + left in the queue so that wb_pick_unwinds picks it up + */ + list_del_init(&req->lie); + } else { + /* TODO: fail the req->frame with error if + necessary + */ + } - STACK_WIND (frame, wb_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc); - return 0; + list_del_init(&req->wip); + __wb_request_unref(req); } +/* get a flush/fsync waiting on req */ +wb_request_t * +__wb_request_waiting_on(wb_request_t *req) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *trav = NULL; -int32_t -wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - - if (loc->inode) { - /* FIXME: fd_lookup extends life of fd till stat returns */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)) { - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - iter_fd = NULL; - } - } - } + wb_inode = req->wb_inode; - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } + list_for_each_entry(trav, &wb_inode->todo, todo) + { + if (((trav->stub->fop == GF_FOP_FLUSH) || + (trav->stub->fop == GF_FOP_FSYNC)) && + (trav->gen >= req->gen)) + return trav; + } - local->file = file; + return NULL; +} - frame->local = local; +void +__wb_add_request_for_retry(wb_request_t *req) +{ + wb_inode_t *wb_inode = NULL; - if (file) { - stub = fop_stat_stub (frame, wb_stat_helper, loc); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if (!req) + goto out; - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - goto 
unwind; - } + wb_inode = req->wb_inode; - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_stat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->stat, loc); - } + /* response was unwound and no waiter waiting on this request, retry + till a flush or fsync (subject to conf->resync_after_fsync). + */ + wb_inode->transit -= req->total_size; - return 0; -unwind: - STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL); + req->total_size = 0; - if (stub) { - call_stub_destroy (stub); - } + list_del_init(&req->winds); + list_del_init(&req->todo); + list_del_init(&req->wip); - if (iter_fd != NULL) { - fd_unref (iter_fd); - } + /* sanitize ordering flags to retry */ + req->ordering.go = 0; - return 0; + /* Add back to todo list to retry */ + list_add(&req->todo, &wb_inode->todo); + +out: + return; } +void +__wb_add_head_for_retry(wb_request_t *head) +{ + wb_request_t *req = NULL, *tmp = NULL; -int32_t -wb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *buf) -{ - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - int32_t ret = -1; - - GF_ASSERT (frame); - - local = frame->local; - file = local->file; - - request = local->request; - if ((file != NULL) && (request != NULL)) { - wb_request_unref (request); - ret = wb_process_queue (frame, file); - if (ret == -1) { - if (errno == ENOMEM) { - op_ret = -1; - op_errno = ENOMEM; - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } + if (!head) + goto out; - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf); + list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds) + { + __wb_add_request_for_retry(req); + } - return 0; -} + __wb_add_request_for_retry(head); +out: + return; +} -int32_t -wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) +void 
+wb_add_head_for_retry(wb_request_t *head) { - GF_ASSERT (frame); - GF_ASSERT (this); + if (!head) + goto out; - STACK_WIND (frame, wb_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd); - return 0; -} + LOCK(&head->wb_inode->lock); + { + __wb_add_head_for_retry(head); + } + UNLOCK(&head->wb_inode->lock); +out: + return; +} -int32_t -wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +void +__wb_fulfill_request_err(wb_request_t *req, int32_t op_errno) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1; - int op_errno = EINVAL; + wb_inode_t *wb_inode = NULL; + wb_request_t *waiter = NULL; + wb_conf_t *conf = NULL; + + wb_inode = req->wb_inode; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + conf = wb_inode->this->private; + req->op_ret = -1; + req->op_errno = op_errno; - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); + if (req->ordering.lied) + waiter = __wb_request_waiting_on(req); + + if (!req->ordering.lied || waiter) { + if (!req->ordering.lied) { + /* response to app is still pending, send failure in + * response. + */ } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } + /* response was sent, store the error in a + * waiter (either an fsync or flush). 
+ */ + waiter->op_ret = -1; + waiter->op_errno = op_errno; } - local = GF_CALLOC (1, sizeof (*local), - gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; + if (!req->ordering.lied || (waiter->stub->fop == GF_FOP_FLUSH) || + ((waiter->stub->fop == GF_FOP_FSYNC) && + !conf->resync_after_fsync)) { + /* No retry needed, forget the request */ + __wb_fulfill_request(req); + return; } + } - local->file = file; - - frame->local = local; + __wb_add_request_for_retry(req); - if (file) { - stub = fop_fstat_stub (frame, wb_fstat_helper, fd); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } - - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - goto unwind; - } - - /* - FIXME:should the request queue be emptied in case of error? - */ - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_fstat_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fstat, fd); - } + return; +} - return 0; +void +wb_head_done(wb_request_t *head) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_inode_t *wb_inode = NULL; -unwind: - STACK_UNWIND_STRICT (fstat, frame, -1, op_errno, NULL); + wb_inode = head->wb_inode; - if (stub) { - call_stub_destroy (stub); + LOCK(&wb_inode->lock); + { + list_for_each_entry_safe(req, tmp, &head->winds, winds) + { + __wb_fulfill_request(req); } - return 0; + __wb_fulfill_request(head); + } + UNLOCK(&wb_inode->lock); } +void +__wb_fulfill_err(wb_request_t *head, int op_errno) +{ + wb_request_t *req = NULL, *tmp = NULL; -int32_t -wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - call_frame_t *process_frame = NULL; - int32_t ret = -1; - fd_t *fd = NULL; - - GF_ASSERT (frame); - - 
local = frame->local; - file = local->file; - request = local->request; - - if ((request != NULL) && (file != NULL)) { - process_frame = copy_frame (frame); - if (process_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - } - } + if (!head) + goto out; - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf); + head->wb_inode->dontsync++; - if (request) { - wb_request_unref (request); - } + list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds) + { + __wb_fulfill_request_err(req, op_errno); + } - if (process_frame != NULL) { - ret = wb_process_queue (process_frame, file); - if (ret == -1) { - if ((errno == ENOMEM) && (file != NULL)) { - LOCK (&file->lock); - { - file->op_ret = -1; - file->op_errno = ENOMEM; - } - UNLOCK (&file->lock); - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + __wb_fulfill_request_err(head, op_errno); - STACK_DESTROY (process_frame->root); - } +out: + return; +} - if (file) { - LOCK (&file->lock); - { - fd = file->fd; - } - UNLOCK (&file->lock); +void +wb_fulfill_err(wb_request_t *head, int op_errno) +{ + wb_inode_t *wb_inode = NULL; - fd_unref (fd); - } + wb_inode = head->wb_inode; - return 0; + LOCK(&wb_inode->lock); + { + __wb_fulfill_err(head, op_errno); + } + UNLOCK(&wb_inode->lock); } - -static int32_t -wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) +void +__wb_modify_write_request(wb_request_t *req, int synced_size) { - GF_ASSERT (frame); - GF_ASSERT (this); + struct iovec *vector = NULL; + int count = 0; - STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); + if (!req || synced_size == 0) + goto out; - return 0; + req->write_size -= synced_size; + req->stub->args.offset += synced_size; + + vector = req->stub->args.vector; + count = req->stub->args.count; + + req->stub->args.count = iov_skip(vector, count, synced_size); + +out: + return; } +int 
+__wb_fulfill_short_write(wb_request_t *req, int size, gf_boolean_t *fulfilled) +{ + int accounted_size = 0; -int32_t -wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) -{ - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - - if (loc->inode) { - /* - FIXME: fd_lookup extends life of fd till the execution of - truncate_cbk - */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)){ - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - } - } - } + if (req == NULL) + goto out; - local = GF_CALLOC (1, sizeof (*local), - gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if (req->write_size <= size) { + accounted_size = req->write_size; + __wb_fulfill_request(req); + *fulfilled = 1; + } else { + accounted_size = size; + __wb_modify_write_request(req, size); + *fulfilled = 0; + } - local->file = file; +out: + return accounted_size; +} - frame->local = local; - if (file) { - stub = fop_truncate_stub (frame, wb_truncate_helper, loc, - offset); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } +void +wb_fulfill_short_write(wb_request_t *head, int size) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *req = NULL, *next = NULL; + int accounted_size = 0; + gf_boolean_t fulfilled = _gf_false; + + if (!head) + goto out; + + wb_inode = head->wb_inode; + + req = head; + + LOCK(&wb_inode->lock); + { + /* hold a reference to head so that __wb_fulfill_short_write + * won't free it. We need head for a cleaner list traversal as + * list_for_each_entry_safe doesn't iterate over "head" member. 
+ * So, if we pass "next->winds" as head to list_for_each_entry, + * "next" is skipped. For a simpler logic we need to traverse + * the list in the order. So, we start traversal from + * "head->winds" and hence we want head to be alive. + */ + __wb_request_ref(head); - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - goto unwind; - } + next = list_entry(head->winds.next, wb_request_t, winds); - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, loc, offset); - } + accounted_size = __wb_fulfill_short_write(head, size, &fulfilled); - return 0; + size -= accounted_size; -unwind: - STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL); + if (size == 0) { + if (fulfilled && (next != head)) + req = next; - if (stub) { - call_stub_destroy (stub); + goto done; } - return 0; -} - + list_for_each_entry_safe(req, next, &head->winds, winds) + { + accounted_size = __wb_fulfill_short_write(req, size, &fulfilled); + size -= accounted_size; -int32_t -wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - wb_local_t *local = NULL; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - int32_t ret = -1; - - GF_ASSERT (frame); - - local = frame->local; - file = local->file; - request = local->request; - - if ((request != NULL) && (file != NULL)) { - wb_request_unref (request); - ret = wb_process_queue (frame, file); - if (ret == -1) { - if (errno == ENOMEM) { - op_ret = -1; - op_errno = ENOMEM; - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + if (size == 0) { + if (fulfilled && (next != head)) + req = next; + break; + } } + done: + __wb_request_unref(head); + } + UNLOCK(&wb_inode->lock); - STACK_UNWIND_STRICT 
(ftruncate, frame, op_ret, op_errno, prebuf, - postbuf); + wb_add_head_for_retry(req); +out: + return; +} - return 0; +int +wb_fulfill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *head = NULL; + + head = frame->local; + frame->local = NULL; + + wb_inode = head->wb_inode; + + /* There could be a readdirp session in progress. Since wb_fulfill_cbk + * can potentially remove a request from liability queue, + * wb_readdirp_cbk will miss writes on this inode (as it invalidates + * stats only if liability queue is not empty) and hence mark inode + * for invalidation of stats in readdirp response. Specifically this + * code fixes the following race mentioned in wb_readdirp_cbk: + */ + + /* <removed comment from wb_readdirp_cbk> + * We cannot guarantee integrity of entry->d_stat as there are cached + * writes. The stat is most likely stale as it doesn't account the + * cached writes. However, checking for non-empty liability list here is + * not a fool-proof solution as there can be races like, + * 1. readdirp is successful on posix + * 2. sync of cached write is successful on posix + * 3. write-behind received sync response and removed the request from + * liability queue + * 4. readdirp response is processed at write-behind + * + * In the above scenario, stat for the file is sent back in readdirp + * response but it is stale. 
+ * </comment> */ + wb_set_invalidate(wb_inode); + + if (op_ret == -1) { + wb_fulfill_err(head, op_errno); + } else if (op_ret < head->total_size) { + wb_fulfill_short_write(head, op_ret); + } else { + wb_head_done(head); + } + + wb_process_queue(wb_inode); + + STACK_DESTROY(frame->root); + + return 0; } +#define WB_IOV_LOAD(vec, cnt, req, head) \ + do { \ + memcpy(&vec[cnt], req->stub->args.vector, \ + (req->stub->args.count * sizeof(vec[0]))); \ + cnt += req->stub->args.count; \ + head->total_size += req->write_size; \ + } while (0) -static int32_t -wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) +int +wb_fulfill_head(wb_inode_t *wb_inode, wb_request_t *head) { - GF_ASSERT (frame); - GF_ASSERT (this); - - STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); - return 0; + struct iovec vector[MAX_VECTOR_COUNT]; + int count = 0; + wb_request_t *req = NULL; + call_frame_t *frame = NULL; + + /* make sure head->total_size is updated before we run into any + * errors + */ + + WB_IOV_LOAD(vector, count, head, head); + + list_for_each_entry(req, &head->winds, winds) + { + WB_IOV_LOAD(vector, count, req, head); + + if (iobref_merge(head->stub->args.iobref, req->stub->args.iobref)) + goto err; + } + + frame = create_frame(wb_inode->this, wb_inode->this->ctx->pool); + if (!frame) + goto err; + + frame->root->lk_owner = head->lk_owner; + frame->root->pid = head->client_pid; + frame->local = head; + + LOCK(&wb_inode->lock); + { + wb_inode->transit += head->total_size; + } + UNLOCK(&wb_inode->lock); + + STACK_WIND(frame, wb_fulfill_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->writev, head->fd, vector, count, + head->stub->args.offset, head->stub->args.flags, + head->stub->args.iobref, NULL); + + return 0; +err: + /* frame creation failure */ + wb_fulfill_err(head, ENOMEM); + + return ENOMEM; } +#define NEXT_HEAD(head, req) \ + do { \ + if (head) \ + ret |= 
wb_fulfill_head(wb_inode, head); \ + head = req; \ + expected_offset = req->stub->args.offset + req->write_size; \ + curr_aggregate = 0; \ + vector_count = 0; \ + } while (0) -int32_t -wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) +int +wb_fulfill(wb_inode_t *wb_inode, list_head_t *liabilities) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1; - int op_errno = EINVAL; + wb_request_t *req = NULL; + wb_request_t *head = NULL; + wb_request_t *tmp = NULL; + wb_conf_t *conf = NULL; + off_t expected_offset = 0; + size_t curr_aggregate = 0; + size_t vector_count = 0; + int ret = 0; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + conf = wb_inode->this->private; + list_for_each_entry_safe(req, tmp, liabilities, winds) + { + list_del_init(&req->winds); - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); - } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } + if (!head) { + NEXT_HEAD(head, req); + continue; } - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; + if (req->fd != head->fd) { + NEXT_HEAD(head, req); + continue; } - local->file = file; - - frame->local = local; + if (!is_same_lkowner(&req->lk_owner, &head->lk_owner)) { + NEXT_HEAD(head, req); + continue; + } - if (file) { - stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, - offset); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if (expected_offset != req->stub->args.offset) { + NEXT_HEAD(head, req); + continue; + } - request = wb_enqueue (file, stub); - if 
(request == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if ((curr_aggregate + req->write_size) > conf->aggregate_size) { + NEXT_HEAD(head, req); + continue; + } - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ftruncate, fd, offset); + if (vector_count + req->stub->args.count > MAX_VECTOR_COUNT) { + NEXT_HEAD(head, req); + continue; } - return 0; + list_add_tail(&req->winds, &head->winds); + curr_aggregate += req->write_size; + vector_count += req->stub->args.count; + } -unwind: - STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL); + if (head) + ret |= wb_fulfill_head(wb_inode, head); - if (stub) { - call_stub_destroy (stub); - } + return ret; +} - return 0; +void +wb_do_unwinds(wb_inode_t *wb_inode, list_head_t *lies) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + call_frame_t *frame = NULL; + struct iatt buf = { + 0, + }; + + list_for_each_entry_safe(req, tmp, lies, unwinds) + { + frame = req->stub->frame; + + STACK_UNWIND_STRICT(writev, frame, req->op_ret, req->op_errno, &buf, + &buf, NULL); /* :O */ + req->stub->frame = NULL; + + list_del_init(&req->unwinds); + wb_request_unref(req); + } + + return; } +void +__wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + char gfid[64] = { + 0, + }; -int32_t -wb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - wb_local_t *local = NULL; - wb_request_t *request = NULL; - call_frame_t *process_frame = NULL; - wb_file_t *file = NULL; - int32_t ret = -1; - fd_t *fd = NULL; - - GF_ASSERT (frame); - - local = frame->local; - file = local->file; - request = local->request; - - if (request) { - process_frame = copy_frame (frame); - if (process_frame 
== NULL) { - op_ret = -1; - op_errno = ENOMEM; - } - } + list_for_each_entry_safe(req, tmp, &wb_inode->temptation, lie) + { + if (!req->ordering.fulfilled && + wb_inode->window_current > wb_inode->window_conf) + continue; - STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre, - statpost); + list_del_init(&req->lie); + list_move_tail(&req->unwinds, lies); - if (request) { - wb_request_unref (request); - } + wb_inode->window_current += req->orig_size; - if (request && (process_frame != NULL)) { - ret = wb_process_queue (process_frame, file); - if (ret == -1) { - if ((errno == ENOMEM) && (file != NULL)) { - LOCK (&file->lock); - { - file->op_ret = -1; - file->op_errno = ENOMEM; - } - UNLOCK (&file->lock); - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + wb_inode->gen++; - STACK_DESTROY (process_frame->root); - } + if (!req->ordering.fulfilled) { + /* burden increased */ + list_add_tail(&req->lie, &wb_inode->liability); - if (file) { - LOCK (&file->lock); - { - fd = file->fd; - } - UNLOCK (&file->lock); + req->ordering.lied = 1; - fd_unref (fd); + uuid_utoa_r(req->gfid, gfid); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): added req to liability " + "queue. 
inode-generation-number=%" PRIu64, + req->stub->frame->root->unique, gf_fop_list[req->fop], + gfid, req->gen, wb_inode->gen); } + } - return 0; + return; } - -static int32_t -wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) +int +__wb_collapse_small_writes(wb_conf_t *conf, wb_request_t *holder, + wb_request_t *req) { - GF_ASSERT (frame); - GF_ASSERT (this); - - STACK_WIND (frame, wb_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid); - return 0; -} + char *ptr = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + int ret = -1; + ssize_t required_size = 0; + size_t holder_len = 0; + size_t req_len = 0; + if (!holder->iobref) { + holder_len = iov_length(holder->stub->args.vector, + holder->stub->args.count); + req_len = iov_length(req->stub->args.vector, req->stub->args.count); -int32_t -wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid) -{ - wb_file_t *file = NULL; - fd_t *iter_fd = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; + required_size = max((conf->page_size), (holder_len + req_len)); + iobuf = iobuf_get2(req->wb_inode->this->ctx->iobuf_pool, required_size); + if (iobuf == NULL) { + goto out; } - frame->local = local; + iobref = iobref_new(); + if (iobref == NULL) { + iobuf_unref(iobuf); + goto out; + } - if (!(valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME))) { - STACK_WIND (frame, wb_setattr_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->setattr, loc, stbuf, - valid); - goto out; + ret = iobref_add(iobref, iobuf); + if (ret != 0) { 
+ gf_msg(req->wb_inode->this->name, GF_LOG_WARNING, -ret, + WRITE_BEHIND_MSG_INVALID_ARGUMENT, + "cannot add iobuf (%p) into iobref (%p)", iobuf, iobref); + iobuf_unref(iobuf); + iobref_unref(iobref); + goto out; } - if (loc->inode) { - /* - FIXME: fd_lookup extends life of fd till the execution - of wb_utimens_cbk - */ - iter_fd = fd_lookup (loc->inode, frame->root->pid); - if (iter_fd) { - if (!fd_ctx_get (iter_fd, this, &tmp_file)) { - file = (wb_file_t *)(long)tmp_file; - } else { - fd_unref (iter_fd); - } - } + iov_unload(iobuf->ptr, holder->stub->args.vector, + holder->stub->args.count); + holder->stub->args.vector[0].iov_base = iobuf->ptr; + holder->stub->args.count = 1; - } + iobref_unref(holder->stub->args.iobref); + holder->stub->args.iobref = iobref; - local->file = file; + iobuf_unref(iobuf); - if (file) { - stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, - valid); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } + holder->iobref = iobref_ref(iobref); + } - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - goto unwind; - } + ptr = holder->stub->args.vector[0].iov_base + holder->write_size; - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_setattr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->setattr, loc, stbuf, - valid); - } + iov_unload(ptr, req->stub->args.vector, req->stub->args.count); - return 0; -unwind: - STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, NULL, NULL); + holder->stub->args.vector[0].iov_len += req->write_size; + holder->write_size += req->write_size; + holder->ordering.size += req->write_size; - if (stub) { - call_stub_destroy (stub); - } + ret = 0; out: - return 0; + return ret; } - -int32_t -wb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, fd_t *fd) +void +__wb_preprocess_winds(wb_inode_t 
*wb_inode) { - int32_t wbflags = 0, flags = 0; - wb_file_t *file = NULL; - wb_conf_t *conf = NULL; - wb_local_t *local = NULL; + off_t offset_expected = 0; + ssize_t space_left = 0; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *holder = NULL; + wb_conf_t *conf = NULL; + int ret = 0; + ssize_t page_size = 0; + char gfid[64] = { + 0, + }; + + /* With asynchronous IO from a VM guest (as a file), there + can be two sequential writes happening in two regions + of the file. But individual (broken down) IO requests + can arrive interleaved. + + TODO: cycle for each such sequence sifting + through the interleaved ops + */ + + conf = wb_inode->this->private; + page_size = conf->page_size; + + list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo) + { + if (wb_inode->dontsync && req->ordering.lied) { + /* sync has failed. Don't pick lies _again_ for winding + * as winding these lies again will trigger an infinite + * recursion of wb_process_queue being called from a + * failed fulfill. However, pick non-lied requests for + * winding so that application won't block indefinitely + * waiting for write result. 
+ */ + + uuid_utoa_r(req->gfid, gfid); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): not setting ordering.go" + "as dontsync is set", + req->unique, gf_fop_list[req->fop], gfid, req->gen); + + continue; + } + + if (!req->ordering.tempted) { + if (holder) { + if (wb_requests_conflict(holder, req)) + /* do not hold on write if a + dependent write is in queue */ + holder->ordering.go = 1; + } + /* collapse only non-sync writes */ + continue; + } else if (!holder) { + /* holder is always a non-sync write */ + holder = req; + continue; + } + + offset_expected = holder->stub->args.offset + holder->write_size; + + if (req->stub->args.offset != offset_expected) { + holder->ordering.go = 1; + holder = req; + continue; + } + + if (!is_same_lkowner(&req->lk_owner, &holder->lk_owner)) { + holder->ordering.go = 1; + holder = req; + continue; + } + + if (req->fd != holder->fd) { + holder->ordering.go = 1; + holder = req; + continue; + } + + space_left = page_size - holder->write_size; + + if (space_left < req->write_size) { + holder->ordering.go = 1; + holder = req; + continue; + } + + ret = __wb_collapse_small_writes(conf, holder, req); + if (ret) + continue; + + /* collapsed request is as good as wound + (from its p.o.v) + */ + list_del_init(&req->todo); + __wb_fulfill_request(req); - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, out, op_errno, - EINVAL); + /* Only the last @holder in queue which - conf = this->private; + - does not have any non-buffered-writes following it + - has not yet filled its capacity - local = frame->local; - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, - EINVAL); + does not get its 'go' set, in anticipation of the arrival + of consecutive smaller writes. 
+ */ + } - flags = local->flags; - wbflags = local->wbflags; + /* but if trickling writes are enabled, then do not hold back + writes if there are no outstanding requests + */ - if (op_ret != -1) { - file = wb_file_create (this, fd, flags); - if (file == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } + if (conf->trickling_writes && !wb_inode->transit && holder) + holder->ordering.go = 1; - LOCK (&file->lock); - { - /* If O_DIRECT then, we disable caching */ - if (((flags & O_DIRECT) == O_DIRECT) - || ((flags & O_ACCMODE) == O_RDONLY) - || (((flags & O_SYNC) == O_SYNC) - && conf->enable_O_SYNC == _gf_true)) { - file->window_conf = 0; - } - - if (wbflags & GF_OPEN_NOWB) { - file->disabled = 1; - } - } - UNLOCK (&file->lock); - } + if (wb_inode->dontsync > 0) + wb_inode->dontsync--; -out: - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); - return 0; + return; } - -int32_t -wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +int +__wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict, + list_head_t *tasks) { - wb_local_t *local = NULL; - int32_t op_errno = EINVAL; - - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } - - local->flags = flags; - local->wbflags = wbflags; - - frame->local = local; - - STACK_WIND (frame, wb_open_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->open, loc, flags, fd, wbflags); - return 0; + wb_conf_t *conf = NULL; + char gfid[64] = { + 0, + }; + + conf = req->wb_inode->this->private; + + uuid_utoa_r(req->gfid, gfid); + + if ((req->stub->fop != GF_FOP_FLUSH) && + ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) { + if (!req->ordering.lied && list_empty(&conflict->wip)) { + /* If request itself is in liability queue, + * 1. We cannot unwind as the response has already been + * sent. + * 2. We cannot wind till conflict clears up. + * 3. So, skip the request for now. 
+ * 4. Otherwise, resume (unwind) it with error. + */ + req->op_ret = -1; + req->op_errno = conflict->op_errno; + if ((req->stub->fop == GF_FOP_TRUNCATE) || + (req->stub->fop == GF_FOP_FTRUNCATE)) { + req->stub->frame->local = NULL; + } + + list_del_init(&req->todo); + list_add_tail(&req->winds, tasks); + + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): A conflicting write " + "request in liability queue has failed " + "to sync (error = \"%s\"), " + "unwinding this request as a failure", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + strerror(req->op_errno)); + + if (req->ordering.tempted) { + /* make sure that it won't be unwound in + * wb_do_unwinds too. Otherwise there'll be + * a double wind. + */ + list_del_init(&req->lie); + + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, " + "gfid=%s, gen=%" PRIu64 + "): " + "removed from liability queue", + req->unique, gf_fop_list[req->fop], gfid, + req->gen); + + __wb_fulfill_request(req); + } + } + } else { + gf_msg_debug(req->wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): A conflicting write request " + "in liability queue has failed to sync " + "(error = \"%s\"). This is an " + "FSYNC/FLUSH and we need to maintain ordering " + "guarantees with other writes in TODO queue. " + "Hence doing nothing now", + req->unique, gf_fop_list[req->fop], gfid, req->gen, + strerror(conflict->op_errno)); + + /* flush and fsync (without conf->resync_after_fsync) act as + barriers. We cannot unwind them out of + order, when there are earlier generation writes just because + there is a conflicting liability with an error. So, wait for + our turn till there are no conflicting liabilities. + + This situation can arise when there liabilities spread across + multiple generations. For eg., consider two writes with + following characterstics: + + 1. 
they belong to different generations gen1, gen2 and + (gen1 > gen2). + 2. they overlap. + 3. both are liabilities. + 4. gen1 write was attempted to sync, but the attempt failed. + 5. there was no attempt to sync gen2 write yet. + 6. A flush (as part of close) is issued and gets a gen no + gen3. + + In the above scenario, if flush is unwound without waiting + for gen1 and gen2 writes either to be successfully synced or + purged, we end up with these two writes in wb_inode->todo + list forever as there will be no attempt to process the queue + as flush is the last operation. + */ + } -unwind: - STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL); - return 0; + return 0; } +int +__wb_pick_winds(wb_inode_t *wb_inode, list_head_t *tasks, + list_head_t *liabilities) +{ + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *conflict = NULL; + char req_gfid[64] = + { + 0, + }, + conflict_gfid[64] = { + 0, + }; + + list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo) + { + uuid_utoa_r(req->gfid, req_gfid); + + conflict = wb_liability_has_conflict(wb_inode, req); + if (conflict) { + uuid_utoa_r(conflict->gfid, conflict_gfid); + + gf_msg_debug(wb_inode->this->name, 0, + "Not winding request due to a " + "conflicting write in liability queue. " + "REQ: unique=%" PRIu64 + ", fop=%s, " + "gen=%" PRIu64 + ", gfid=%s. " + "CONFLICT: unique=%" PRIu64 + ", fop=%s, " + "gen=%" PRIu64 + ", gfid=%s, " + "conflicts-sync-failed?=%s, " + "conflicts-error=%s", + req->unique, gf_fop_list[req->fop], req->gen, req_gfid, + conflict->unique, gf_fop_list[conflict->fop], + conflict->gen, conflict_gfid, + (conflict->op_ret == 1) ? "yes" : "no", + strerror(conflict->op_errno)); + + if (conflict->op_ret == -1) { + /* There is a conflicting liability which failed + * to sync in previous attempts, resume the req + * and fail, unless its an fsync/flush. 
+ */ -int32_t -wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) -{ - long flags = 0; - wb_file_t *file = NULL; - wb_conf_t *conf = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, out, - op_errno, EINVAL); - - conf = this->private; - if (op_ret != -1) { - if (frame->local) { - flags = (long) frame->local; - } - - file = wb_file_create (this, fd, flags); - if (file == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto out; - } - - LOCK (&file->lock); - { - /* If O_DIRECT then, we disable caching */ - if (frame->local) { - if (((flags & O_DIRECT) == O_DIRECT) - || ((flags & O_ACCMODE) == O_RDONLY) - || (((flags & O_SYNC) == O_SYNC) - && (conf->enable_O_SYNC == _gf_true))) { - file->window_conf = 0; - } - } - } - UNLOCK (&file->lock); + __wb_handle_failed_conflict(req, conflict, tasks); + } else { + /* There is a conflicting liability which was + * not attempted to sync even once. Wait till + * at least one attempt to sync is made. + */ + } + + continue; + } + + if (req->ordering.tempted && !req->ordering.go) { + /* wait some more */ + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 ", fop=%s, gen=%" PRIu64 + ", gfid=%s): ordering.go is not set, " + "hence not winding", + req->unique, gf_fop_list[req->fop], req->gen, + req_gfid); + continue; + } + + if (req->stub->fop == GF_FOP_WRITE) { + conflict = wb_wip_has_conflict(wb_inode, req); + + if (conflict) { + uuid_utoa_r(conflict->gfid, conflict_gfid); + + gf_msg_debug(wb_inode->this->name, 0, + "Not winding write request as " + "a conflicting write is being " + "synced to backend. " + "REQ: unique=%" PRIu64 + " fop=%s," + " gen=%" PRIu64 + ", gfid=%s. 
" + "CONFLICT: unique=%" PRIu64 + " " + "fop=%s, gen=%" PRIu64 + ", " + "gfid=%s", + req->unique, gf_fop_list[req->fop], req->gen, + req_gfid, conflict->unique, + gf_fop_list[conflict->fop], conflict->gen, + conflict_gfid); + continue; + } + + list_add_tail(&req->wip, &wb_inode->wip); + req->wind_count++; + + if (!req->ordering.tempted) + /* unrefed in wb_writev_cbk */ + req->stub->frame->local = __wb_request_ref(req); + } + + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 + ", fop=%s, gfid=%s, " + "gen=%" PRIu64 + "): picking the request for " + "winding", + req->unique, gf_fop_list[req->fop], req_gfid, req->gen); + + list_del_init(&req->todo); + + if (req->ordering.tempted) { + list_add_tail(&req->winds, liabilities); + } else { + list_add_tail(&req->winds, tasks); } + } - frame->local = NULL; - -out: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); - return 0; + return 0; } - -int32_t -wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd, dict_t *params) +void +wb_do_winds(wb_inode_t *wb_inode, list_head_t *tasks) { - int32_t op_errno = EINVAL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; - frame->local = (void *)(long)flags; + list_for_each_entry_safe(req, tmp, tasks, winds) + { + list_del_init(&req->winds); - STACK_WIND (frame, wb_create_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); - return 0; + if (req->op_ret == -1) { + call_unwind_error_keep_stub(req->stub, req->op_ret, req->op_errno); + } else { + call_resume_keep_stub(req->stub); + } -unwind: - STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); - return 0; + wb_request_unref(req); + } } - -/* Mark all the contiguous write 
requests for winding starting from head of - * request list. Stops marking at the first non-write request found. If - * file is opened with O_APPEND, make sure all the writes marked for winding - * will fit into a single write call to server. - */ -size_t -__wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds) +void +wb_process_queue(wb_inode_t *wb_inode) { - wb_request_t *request = NULL; - size_t size = 0; - char first_request = 1; - off_t offset_expected = 0; - wb_conf_t *conf = NULL; - int count = 0; + list_head_t tasks; + list_head_t lies; + list_head_t liabilities; + int wind_failure = 0; - GF_VALIDATE_OR_GOTO ("write-behind", file, out); - GF_VALIDATE_OR_GOTO (file->this->name, list, out); - GF_VALIDATE_OR_GOTO (file->this->name, winds, out); + INIT_LIST_HEAD(&tasks); + INIT_LIST_HEAD(&lies); + INIT_LIST_HEAD(&liabilities); - conf = file->this->private; + do { + gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG, + "processing queues"); - list_for_each_entry (request, list, list) + LOCK(&wb_inode->lock); { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - break; - } + __wb_preprocess_winds(wb_inode); - if (!request->flags.write_request.stack_wound) { - if (first_request) { - first_request = 0; - offset_expected - = request->stub->args.writev.off; - } - - if (request->stub->args.writev.off != offset_expected) { - break; - } - - if ((file->flags & O_APPEND) - && (((size + request->write_size) - > conf->aggregate_size) - || ((count + request->stub->args.writev.count) - > MAX_VECTOR_COUNT))) { - break; - } - - size += request->write_size; - offset_expected += request->write_size; - file->aggregate_current -= request->write_size; - count += request->stub->args.writev.count; - - request->flags.write_request.stack_wound = 1; - list_add_tail (&request->winds, winds); - } + __wb_pick_winds(wb_inode, &tasks, &liabilities); + + __wb_pick_unwinds(wb_inode, &lies); } + UNLOCK(&wb_inode->lock); -out: - return size; + if 
(!list_empty(&lies)) + wb_do_unwinds(wb_inode, &lies); + + if (!list_empty(&tasks)) + wb_do_winds(wb_inode, &tasks); + + /* If there is an error in wb_fulfill before winding write + * requests, we would miss invocation of wb_process_queue + * from wb_fulfill_cbk. So, retry processing again. + */ + if (!list_empty(&liabilities)) + wind_failure = wb_fulfill(wb_inode, &liabilities); + } while (wind_failure); + + return; } +void +wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf) +{ + GF_ASSERT(wb_inode); + GF_ASSERT(postbuf); + + LOCK(&wb_inode->lock); + { + wb_inode->size = postbuf->ia_size; + } + UNLOCK(&wb_inode->lock); +} -int32_t -__wb_can_wind (list_head_t *list, char *other_fop_in_queue, - char *non_contiguous_writes, char *incomplete_writes, - char *wind_all) +int +wb_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - wb_request_t *request = NULL; - char first_request = 1; - off_t offset_expected = 0; - int32_t ret = -1; + wb_request_t *req = NULL; + wb_inode_t *wb_inode; - GF_VALIDATE_OR_GOTO ("write-behind", list, out); + req = frame->local; + frame->local = NULL; + wb_inode = req->wb_inode; - list_for_each_entry (request, list, list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - if (request->stub && other_fop_in_queue) { - *other_fop_in_queue = 1; - } - break; - } + LOCK(&req->wb_inode->lock); + { + list_del_init(&req->wip); + } + UNLOCK(&req->wb_inode->lock); - if (request->flags.write_request.stack_wound - && !request->flags.write_request.got_reply - && (incomplete_writes != NULL)) { - *incomplete_writes = 1; - break; - } + wb_request_unref(req); - if (!request->flags.write_request.stack_wound) { - if (first_request) { - char flush = 0; - first_request = 0; - offset_expected - = request->stub->args.writev.off; - - flush = request->flags.write_request.flush_all; - if (wind_all != NULL) { - *wind_all = 
flush; - } - } - - if (offset_expected != request->stub->args.writev.off) { - if (non_contiguous_writes) { - *non_contiguous_writes = 1; - } - break; - } - - offset_expected += request->write_size; - } - } + /* requests could be pending while this was in progress */ + wb_process_queue(wb_inode); - ret = 0; -out: - return ret; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } - -ssize_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf, - char enable_trickling_writes) +int +wb_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - size_t size = 0; - char other_fop_in_queue = 0; - char incomplete_writes = 0; - char non_contiguous_writes = 0; - wb_request_t *request = NULL; - wb_file_t *file = NULL; - char wind_all = 0; - int32_t ret = 0; - - GF_VALIDATE_OR_GOTO ("write-behind", list, out); - GF_VALIDATE_OR_GOTO ("write-behind", winds, out); - - if (list_empty (list)) { - goto out; - } + STACK_WIND(frame, wb_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; +} - request = list_entry (list->next, typeof (*request), list); - file = request->file; +int +wb_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + wb_conf_t *conf = NULL; + gf_boolean_t wb_disabled = 0; + call_stub_t *stub = NULL; + int ret = -1; + int32_t op_errno = EINVAL; + int o_direct = O_DIRECT; + + conf = this->private; + + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; + } + + if (!conf->strict_O_DIRECT) + o_direct = 0; + + if (fd->flags & (O_SYNC | O_DSYNC | o_direct)) + wb_disabled = 1; + + if (flags & (O_SYNC | O_DSYNC | o_direct)) + 
wb_disabled = 1; + + if (wb_disabled) + stub = fop_writev_stub(frame, wb_writev_helper, fd, vector, count, + offset, flags, iobref, xdata); + else + stub = fop_writev_stub(frame, NULL, fd, vector, count, offset, flags, + iobref, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } + + if (wb_disabled) + ret = wb_enqueue(wb_inode, stub); + else + ret = wb_enqueue_tempted(wb_inode, stub); + + if (!ret) { + op_errno = ENOMEM; + goto unwind; + } + + wb_process_queue(wb_inode); + + return 0; - ret = __wb_can_wind (list, &other_fop_in_queue, - &non_contiguous_writes, &incomplete_writes, - &wind_all); - if (ret == -1) { - gf_log (file->this->name, GF_LOG_WARNING, - "cannot decide whether to wind or not"); - goto out; - } +unwind: + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); - if (!incomplete_writes && ((enable_trickling_writes) - || (wind_all) || (non_contiguous_writes) - || (other_fop_in_queue) - || (file->aggregate_current - >= aggregate_conf))) { - size = __wb_mark_wind_all (file, list, winds); - } + if (stub) + call_stub_destroy(stub); -out: - return size; + return 0; } - -size_t -__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size) +int +wb_readv_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - size_t written_behind = 0; - wb_request_t *request = NULL; - wb_file_t *file = NULL; + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} - if (list_empty (list)) { - goto out; - } +int +wb_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - request = list_entry (list->next, typeof (*request), list); - file = request->file; + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - list_for_each_entry (request, list, 
list) - { - if ((request->stub == NULL) - || (request->stub->fop != GF_FOP_WRITE)) { - continue; - } + stub = fop_readv_stub(frame, wb_readv_helper, fd, size, offset, flags, + xdata); + if (!stub) + goto unwind; - if (written_behind <= size) { - if (!request->flags.write_request.write_behind) { - written_behind += request->write_size; - request->flags.write_request.write_behind = 1; - list_add_tail (&request->unwinds, unwinds); - - if (!request->flags.write_request.got_reply) { - file->window_current - += request->write_size; - } - } - } else { - break; - } - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; -out: - return written_behind; -} + wb_process_queue(wb_inode); + return 0; -void -__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds) -{ - wb_request_t *request = NULL; - wb_file_t *file = NULL; +unwind: + STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, NULL); - GF_VALIDATE_OR_GOTO ("write-behind", list, out); - GF_VALIDATE_OR_GOTO ("write-behind", unwinds, out); + if (stub) + call_stub_destroy(stub); + return 0; - if (list_empty (list)) { - goto out; - } +noqueue: + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata); + return 0; +} - request = list_entry (list->next, typeof (*request), list); - file = request->file; +int +wb_flush_bg_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + STACK_DESTROY(frame->root); + return 0; +} - if (file->window_current <= file->window_conf) { - __wb_mark_unwind_till (list, unwinds, - file->window_conf - - file->window_current); - } +int +wb_flush_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + wb_conf_t *conf = NULL; + wb_inode_t *wb_inode = NULL; + call_frame_t *bg_frame = NULL; + int32_t op_errno = 0; + int op_ret = 0; + + conf = this->private; + + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) { + op_ret = -1; + op_errno = 
EINVAL; + goto unwind; + } + + if (conf->flush_behind) + goto flushbehind; + + STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + +flushbehind: + bg_frame = copy_frame(frame); + if (!bg_frame) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + STACK_WIND(bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + /* fall through */ +unwind: + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL); -out: - return; + return 0; } - -uint32_t -__wb_get_other_requests (list_head_t *list, list_head_t *other_requests) +int +wb_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - wb_request_t *request = NULL; - uint32_t count = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", list, out); - GF_VALIDATE_OR_GOTO ("write-behind", other_requests, out); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - list_for_each_entry (request, list, list) { - if ((request->stub == NULL) - || (request->stub->fop == GF_FOP_WRITE)) { - break; - } + stub = fop_flush_stub(frame, wb_flush_helper, fd, xdata); + if (!stub) + goto unwind; - if (!request->flags.other_requests.marked_for_resume) { - request->flags.other_requests.marked_for_resume = 1; - list_add_tail (&request->other_requests, - other_requests); - count++; - } - } - -out: - return count; -} + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + wb_process_queue(wb_inode); -int32_t -wb_stack_unwind (list_head_t *unwinds) -{ - struct iatt buf = {0,}; - wb_request_t *request = NULL, *dummy = NULL; - call_frame_t *frame = NULL; - wb_local_t *local = NULL; - int ret = 0, write_requests_removed = 0; + return 0; - GF_VALIDATE_OR_GOTO ("write-behind", unwinds, out); +unwind: + STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL); - list_for_each_entry_safe (request, dummy, unwinds, unwinds) { - frame = request->stub->frame; - local = 
frame->local; + if (stub) + call_stub_destroy(stub); - STACK_UNWIND (frame, local->op_ret, local->op_errno, &buf, - &buf); + return 0; - ret = wb_request_unref (request); - if (ret == 0) { - write_requests_removed++; - } - } +noqueue: + STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +} -out: - return write_requests_removed; +int +wb_fsync_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; } +int +wb_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = EINVAL; -int32_t -wb_resume_other_requests (call_frame_t *frame, wb_file_t *file, - list_head_t *other_requests) -{ - int32_t ret = -1; - wb_request_t *request = NULL, *dummy = NULL; - int32_t fops_removed = 0; - char wind = 0; - call_stub_t *stub = NULL; - - GF_VALIDATE_OR_GOTO ((file ? 
file->this->name : "write-behind"), frame, - out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); - GF_VALIDATE_OR_GOTO (frame->this->name, other_requests, out); - - if (list_empty (other_requests)) { - ret = 0; - goto out; - } + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - list_for_each_entry_safe (request, dummy, other_requests, - other_requests) { - wind = request->stub->wind; - stub = request->stub; + stub = fop_fsync_stub(frame, wb_fsync_helper, fd, datasync, xdata); + if (!stub) + goto unwind; - LOCK (&file->lock); - { - request->stub = NULL; - } - UNLOCK (&file->lock); + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - if (!wind) { - wb_request_unref (request); - fops_removed++; - } + wb_process_queue(wb_inode); - call_resume (stub); - } + return 0; - ret = 0; +unwind: + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL); - if (fops_removed > 0) { - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } + if (stub) + call_stub_destroy(stub); + return 0; -out: - return ret; +noqueue: + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; } +int +wb_stat_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} -int32_t -wb_do_ops (call_frame_t *frame, wb_file_t *file, list_head_t *winds, - list_head_t *unwinds, list_head_t *other_requests) +int +wb_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - int32_t ret = -1, write_requests_removed = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_VALIDATE_OR_GOTO ((file ? 
file->this->name : "write-behind"), - frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, file, out); + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; - ret = wb_stack_unwind (unwinds); + stub = fop_stat_stub(frame, wb_stat_helper, loc, xdata); + if (!stub) + goto unwind; - write_requests_removed = ret; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - ret = wb_sync (frame, file, winds); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "syncing of write requests failed"); - } + wb_process_queue(wb_inode); - ret = wb_resume_other_requests (frame, file, other_requests); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cannot resume non-write requests in request queue"); - } + return 0; - /* wb_stack_unwind does wb_request_unref after unwinding a write - * request. Hence if a write-request was just freed in wb_stack_unwind, - * we have to process request queue once again to unblock requests - * blocked on the writes just unwound. 
- */ - if (write_requests_removed > 0) { - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (frame->this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } +unwind: + STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL); -out: - return ret; + if (stub) + call_stub_destroy(stub); + return 0; + +noqueue: + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; } +int +wb_fstat_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} -inline int -__wb_copy_into_holder (wb_request_t *holder, wb_request_t *request) +int +wb_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - char *ptr = NULL; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - int ret = -1; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - if (holder->flags.write_request.virgin) { - iobuf = iobuf_get (request->file->this->ctx->iobuf_pool); - if (iobuf == NULL) { - goto out; - } + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - iobref = iobref_new (); - if (iobref == NULL) { - iobuf_unref (iobuf); - goto out; - } + stub = fop_fstat_stub(frame, wb_fstat_helper, fd, xdata); + if (!stub) + goto unwind; - ret = iobref_add (iobref, iobuf); - if (ret != 0) { - iobuf_unref (iobuf); - iobref_unref (iobref); - gf_log (request->file->this->name, GF_LOG_WARNING, - "cannot add iobuf (%p) into iobref (%p)", - iobuf, iobref); - goto out; - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - iov_unload (iobuf->ptr, holder->stub->args.writev.vector, - holder->stub->args.writev.count); - holder->stub->args.writev.vector[0].iov_base = iobuf->ptr; + wb_process_queue(wb_inode); - iobref_unref (holder->stub->args.writev.iobref); - holder->stub->args.writev.iobref = iobref; + return 0; - iobuf_unref (iobuf); +unwind: + 
STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL); - holder->flags.write_request.virgin = 0; - } + if (stub) + call_stub_destroy(stub); + return 0; - ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size; +noqueue: + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} - iov_unload (ptr, request->stub->args.writev.vector, - request->stub->args.writev.count); +int32_t +wb_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT(frame->local); - holder->stub->args.writev.vector[0].iov_len += request->write_size; - holder->write_size += request->write_size; + if (op_ret == 0) + wb_set_inode_size(frame->local, postbuf); - request->flags.write_request.stack_wound = 1; - list_move_tail (&request->list, &request->file->passive_requests); + frame->local = NULL; - ret = 0; -out: - return ret; + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +int +wb_truncate_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + STACK_WIND(frame, wb_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} -/* this procedure assumes that write requests have only one vector to write */ -void -__wb_collapse_write_bufs (list_head_t *requests, size_t page_size) +int +wb_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - off_t offset_expected = 0; - size_t space_left = 0; - wb_request_t *request = NULL, *tmp = NULL, *holder = NULL; - int ret = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", requests, out); + wb_inode = wb_inode_create(this, loc->inode); + if (!wb_inode) + goto unwind; - list_for_each_entry_safe (request, tmp, requests, list) { - if ((request->stub == 
NULL) - || (request->stub->fop != GF_FOP_WRITE) - || (request->flags.write_request.stack_wound)) { - holder = NULL; - continue; - } + frame->local = wb_inode; - if (request->flags.write_request.write_behind) { - if (holder == NULL) { - holder = request; - continue; - } + stub = fop_truncate_stub(frame, wb_truncate_helper, loc, offset, xdata); + if (!stub) + goto unwind; - offset_expected = holder->stub->args.writev.off - + holder->write_size; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - if (request->stub->args.writev.off != offset_expected) { - holder = request; - continue; - } + wb_process_queue(wb_inode); - space_left = page_size - holder->write_size; + return 0; - if (space_left >= request->write_size) { - ret = __wb_copy_into_holder (holder, request); - if (ret != 0) { - break; - } +unwind: + STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL); - __wb_request_unref (request); - } else { - holder = request; - } - } else { - break; - } - } + if (stub) + call_stub_destroy(stub); -out: - return; + return 0; } - int32_t -wb_process_queue (call_frame_t *frame, wb_file_t *file) +wb_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - list_head_t winds = {0, }, unwinds = {0, }, other_requests = {0, }; - size_t size = 0; - wb_conf_t *conf = NULL; - uint32_t count = 0; - int32_t ret = -1; - - INIT_LIST_HEAD (&winds); - INIT_LIST_HEAD (&unwinds); - INIT_LIST_HEAD (&other_requests); - - GF_VALIDATE_OR_GOTO ((file ? 
file->this->name : "write-behind"), frame, - out); - GF_VALIDATE_OR_GOTO (file->this->name, frame, out); - - conf = file->this->private; - GF_VALIDATE_OR_GOTO (file->this->name, conf, out); - - size = conf->aggregate_size; - LOCK (&file->lock); - { - /* - * make sure requests are marked for unwinding and adjacent - * contiguous write buffers (each of size less than that of - * an iobuf) are packed properly so that iobufs are filled to - * their maximum capacity, before calling __wb_mark_winds. - */ - __wb_mark_unwinds (&file->request, &unwinds); - - __wb_collapse_write_bufs (&file->request, - file->this->ctx->page_size); + GF_ASSERT(frame->local); - count = __wb_get_other_requests (&file->request, - &other_requests); + if (op_ret == 0) + wb_set_inode_size(frame->local, postbuf); - if (count == 0) { - __wb_mark_winds (&file->request, &winds, size, - conf->enable_trickling_writes); - } - - } - UNLOCK (&file->lock); + frame->local = NULL; - ret = wb_do_ops (frame, file, &winds, &unwinds, &other_requests); - -out: - return ret; + STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +int +wb_ftruncate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + STACK_WIND(frame, wb_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} -int32_t -wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +int +wb_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - GF_ASSERT (frame); + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; + int32_t op_errno = 0; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) { + op_errno = ENOMEM; + goto unwind; + } + frame->local = wb_inode; -int32_t -wb_writev (call_frame_t *frame, 
xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, struct iobref *iobref) -{ - wb_file_t *file = NULL; - char wb_disabled = 0; - call_frame_t *process_frame = NULL; - size_t size = 0; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - wb_local_t *local = NULL; - wb_request_t *request = NULL; - int32_t ret = -1; - int32_t op_ret = -1, op_errno = EINVAL; - - GF_ASSERT (frame); - - GF_VALIDATE_OR_GOTO_WITH_ERROR ("write-behind", this, unwind, op_errno, - EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, - EINVAL); - - if (vector != NULL) - size = iov_length (vector, count); - - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); - } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } - } + stub = fop_ftruncate_stub(frame, wb_ftruncate_helper, fd, offset, xdata); + if (!stub) { + op_errno = ENOMEM; + goto unwind; + } - if (file != NULL) { - LOCK (&file->lock); - { - op_ret = file->op_ret; - op_errno = file->op_errno; - - file->op_ret = 0; - - if ((op_ret == 0) - && (file->disabled || file->disable_till)) { - if (size > file->disable_till) { - file->disable_till = 0; - } else { - file->disable_till -= size; - } - wb_disabled = 1; - } - } - UNLOCK (&file->lock); - } else { - wb_disabled = 1; - } + if (!wb_enqueue(wb_inode, stub)) { + op_errno = ENOMEM; + goto unwind; + } - if (op_ret == -1) { - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, - NULL); - return 0; - } + wb_process_queue(wb_inode); - if (wb_disabled) { - STACK_WIND (frame, wb_writev_cbk, FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->writev, - fd, vector, count, offset, iobref); - return 0; - } + return 0; - process_frame = copy_frame (frame); - if (process_frame == NULL) { - op_errno = 
ENOMEM; - goto unwind; - } +unwind: + frame->local = NULL; - local = GF_CALLOC (1, sizeof (*local), - gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } + STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - frame->local = local; - local->file = file; + if (stub) + call_stub_destroy(stub); + return 0; +} - stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, iobref); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } +int +wb_setattr_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; +} - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - goto unwind; - } +int +wb_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - ret = wb_process_queue (process_frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; - STACK_DESTROY (process_frame->root); + stub = fop_setattr_stub(frame, wb_setattr_helper, loc, stbuf, valid, xdata); + if (!stub) + goto unwind; - return 0; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; + wb_process_queue(wb_inode); + + return 0; unwind: - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL); + STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); - if (process_frame) { - STACK_DESTROY (process_frame->root); - } + if (stub) + call_stub_destroy(stub); + return 0; - if (stub) { - call_stub_destroy (stub); - } - - return 0; +noqueue: + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; } +int 
+wb_fsetattr_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; +} -int32_t -wb_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iovec *vector, int32_t count, - struct iatt *stbuf, struct iobref *iobref) +int +wb_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL; - int32_t ret = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_ASSERT (frame); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - local = frame->local; - file = local->file; - request = local->request; + stub = fop_fsetattr_stub(frame, wb_fsetattr_helper, fd, stbuf, valid, + xdata); + if (!stub) + goto unwind; - if ((request != NULL) && (file != NULL)) { - wb_request_unref (request); + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - ret = wb_process_queue (frame, file); - if (ret == -1) { - if (errno == ENOMEM) { - op_ret = -1; - op_errno = ENOMEM; - } + wb_process_queue(wb_inode); - gf_log (frame->this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } + return 0; +unwind: + STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref); + if (stub) + call_stub_destroy(stub); + return 0; - return 0; +noqueue: + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; } - -static int32_t -wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) +int32_t +wb_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, 
mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, wb_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, fd, size, offset); - - return 0; -} + wb_inode_t *wb_inode = NULL; + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) + goto unwind; -int32_t -wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - int32_t ret = -1, op_errno = 0; - wb_request_t *request = NULL; - - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, - op_errno, EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, - EINVAL); - - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); - } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } - } + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; - local->file = file; +unwind: + STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} - frame->local = local; - if (file) { - stub = fop_readv_stub (frame, wb_readv_helper, fd, size, - offset); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } +int32_t +wb_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; - request = wb_enqueue (file, stub); - if (request == NULL) { - call_stub_destroy (stub); - op_errno = ENOMEM; - goto 
unwind; - } + wb_inode = wb_inode_create(this, fd->inode); + if (!wb_inode) + goto unwind; - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_readv_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readv, - fd, size, offset); - } + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; - return 0; + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + loc, flags, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL); + return 0; } - int32_t -wb_ffr_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +wb_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) { - STACK_DESTROY (frame->root); - return 0; + if (op_ret == 0) { + wb_inode_t *wb_inode = wb_inode_ctx_get(this, inode); + if (wb_inode) + wb_set_inode_size(wb_inode, buf); + } + + STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; } +int +wb_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} int32_t -wb_ffr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) +wb_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_ASSERT (frame); + wb_inode = wb_inode_ctx_get(this, loc->inode); + if (!wb_inode) + goto noqueue; - local = frame->local; - file = local->file; + stub = fop_lookup_stub(frame, 
wb_lookup_helper, loc, xdata); + if (!stub) + goto unwind; - if (file != NULL) { - LOCK (&file->lock); - { - if (file->op_ret == -1) { - op_ret = file->op_ret; - op_errno = file->op_errno; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - file->op_ret = 0; - } - } - UNLOCK (&file->lock); - } + wb_process_queue(wb_inode); - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); + return 0; - return 0; +unwind: + if (stub) + call_stub_destroy(stub); + + STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); + return 0; + +noqueue: + STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; } +static void +wb_mark_readdirp_start(xlator_t *this, inode_t *directory) +{ + wb_inode_t *wb_directory_inode = NULL; -int32_t -wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) + wb_directory_inode = wb_inode_create(this, directory); + + if (!wb_directory_inode) + return; + + LOCK(&wb_directory_inode->lock); + { + GF_ATOMIC_INC(wb_directory_inode->readdirps); + } + UNLOCK(&wb_directory_inode->lock); + + return; +} + +static void +wb_mark_readdirp_end(xlator_t *this, inode_t *directory) { - wb_conf_t *conf = NULL; - wb_local_t *local = NULL; - wb_file_t *file = NULL; - call_frame_t *flush_frame = NULL, *process_frame = NULL; - int32_t op_ret = -1, op_errno = -1, ret = -1; + wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL; + int readdirps = 0; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, - op_errno, EINVAL); + wb_directory_inode = wb_inode_ctx_get(this, directory); - conf = this->private; + if (!wb_directory_inode) + return; - local = frame->local; - file = local->file; + LOCK(&wb_directory_inode->lock); + { + readdirps = GF_ATOMIC_DEC(wb_directory_inode->readdirps); + if (readdirps) + goto unlock; - LOCK (&file->lock); + list_for_each_entry_safe(wb_inode, tmp, + &wb_directory_inode->invalidate_list, + invalidate_list) { - op_ret = 
file->op_ret; - op_errno = file->op_errno; + list_del_init(&wb_inode->invalidate_list); + GF_ATOMIC_INIT(wb_inode->invalidate, 0); + inode_unref(wb_inode->inode); } - UNLOCK (&file->lock); + } +unlock: + UNLOCK(&wb_directory_inode->lock); - if (local && local->request) { - process_frame = copy_frame (frame); - if (process_frame == NULL) { - op_errno = ENOMEM; - goto unwind; - } + return; +} - wb_request_unref (local->request); - } +int32_t +wb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + gf_dirent_t *entry = NULL; + inode_t *inode = NULL; + fd_t *fd = NULL; - if (conf->flush_behind) { - flush_frame = copy_frame (frame); - if (flush_frame == NULL) { - op_errno = ENOMEM; - goto unwind; - } + fd = frame->local; + frame->local = NULL; - STACK_WIND (flush_frame, wb_ffr_bg_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd); - } else { - STACK_WIND (frame, wb_ffr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd); - } + if (op_ret <= 0) + goto unwind; - if (process_frame != NULL) { - ret = wb_process_queue (process_frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } + list_for_each_entry(entry, &entries->list, list) + { + if (!entry->inode || !IA_ISREG(entry->d_stat.ia_type)) + continue; - STACK_DESTROY (process_frame->root); - } + wb_inode = wb_inode_ctx_get(this, entry->inode); + if (!wb_inode) + continue; + + LOCK(&wb_inode->lock); + { + if (!list_empty(&wb_inode->liability) || + GF_ATOMIC_GET(wb_inode->invalidate)) { + inode = entry->inode; - if (conf->flush_behind) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); + entry->inode = NULL; + memset(&entry->d_stat, 0, sizeof(entry->d_stat)); + } } + UNLOCK(&wb_inode->lock); - return 0; + if (inode) { + inode_unref(inode); + inode = NULL; + } + } unwind: - STACK_UNWIND_STRICT (flush, frame, -1, op_errno); 
- return 0; -} + wb_mark_readdirp_end(this, fd->inode); + frame->local = NULL; + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} int32_t -wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +wb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) { - wb_conf_t *conf = NULL; - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - call_stub_t *stub = NULL; - call_frame_t *flush_frame = NULL; - wb_request_t *request = NULL; - int32_t ret = 0, op_errno = 0; + wb_mark_readdirp_start(this, fd->inode); - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, - op_errno, EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, - EINVAL); + frame->local = fd; - conf = this->private; + STACK_WIND(frame, wb_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; +} - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); - } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } - } +int32_t +wb_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; +} - if (file != NULL) { - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } +int32_t +wb_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - local->file = file; + wb_inode = wb_inode_ctx_get(this, oldloc->inode); + if (!wb_inode) + goto noqueue; - frame->local = local; + 
stub = fop_link_stub(frame, wb_link_helper, oldloc, newloc, xdata); + if (!stub) + goto unwind; - stub = fop_flush_stub (frame, wb_flush_helper, fd); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - request = wb_enqueue (file, stub); - if (request == NULL) { - call_stub_destroy (stub); - op_errno = ENOMEM; - goto unwind; - } + wb_process_queue(wb_inode); - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - if (conf->flush_behind) { - flush_frame = copy_frame (frame); - if (flush_frame == NULL) { - op_errno = ENOMEM; - goto unwind; - } - - STACK_UNWIND_STRICT (flush, frame, 0, 0); - - STACK_WIND (flush_frame, wb_ffr_bg_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd); - } else { - STACK_WIND (frame, wb_ffr_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, fd); - } - } - - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (flush, frame, -1, op_errno); - return 0; + STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL); + + if (stub) + call_stub_destroy(stub); + + return 0; + +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; } +int32_t +wb_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, + len, xdata); + return 0; +} -static int32_t -wb_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf) +int32_t +wb_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) { - wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_request_t *request = NULL; - int32_t ret = 
-1; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_ASSERT (frame); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - local = frame->local; - file = local->file; - request = local->request; + stub = fop_fallocate_stub(frame, wb_fallocate_helper, fd, keep_size, offset, + len, xdata); + if (!stub) + goto unwind; - if (file != NULL) { - LOCK (&file->lock); - { - if (file->op_ret == -1) { - op_ret = file->op_ret; - op_errno = file->op_errno; + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - file->op_ret = 0; - } - } - UNLOCK (&file->lock); - - if (request) { - wb_request_unref (request); - ret = wb_process_queue (frame, file); - if (ret == -1) { - if (errno == ENOMEM) { - op_ret = -1; - op_errno = ENOMEM; - } - - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } + wb_process_queue(wb_inode); - } + return 0; - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf); +unwind: + STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); - return 0; -} + if (stub) + call_stub_destroy(stub); + return 0; -static int32_t -wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) -{ - STACK_WIND (frame, wb_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, datasync); - return 0; +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, + len, xdata); + return 0; } +int32_t +wb_discard_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; +} int32_t -wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) +wb_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - wb_file_t *file = NULL; - wb_local_t *local = NULL; - uint64_t tmp_file = 0; - 
call_stub_t *stub = NULL; - wb_request_t *request = NULL; - int32_t ret = -1, op_errno = 0; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_ASSERT (frame); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, - op_errno, EINVAL); - GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, fd, unwind, - op_errno, EINVAL); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; + stub = fop_discard_stub(frame, wb_discard_helper, fd, offset, len, xdata); + if (!stub) + goto unwind; - if ((!IA_ISDIR (fd->inode->ia_type)) - && fd_ctx_get (fd, this, &tmp_file)) { - file = wb_file_create (this, fd, 0); - } else { - file = (wb_file_t *)(long)tmp_file; - if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) { - gf_log (this->name, GF_LOG_WARNING, - "wb_file not found for fd %p", fd); - op_errno = EBADFD; - goto unwind; - } - } + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t); - if (local == NULL) { - op_errno = ENOMEM; - goto unwind; - } + wb_process_queue(wb_inode); - local->file = file; + return 0; - frame->local = local; +unwind: + STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); - if (file) { - stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync); - if (stub == NULL) { - op_errno = ENOMEM; - goto unwind; - } + if (stub) + call_stub_destroy(stub); + return 0; - request = wb_enqueue (file, stub); - if (request == NULL) { - op_errno = ENOMEM; - call_stub_destroy (stub); - goto unwind; - } +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); - ret = wb_process_queue (frame, file); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "request queue processing failed"); - } - } else { - STACK_WIND (frame, wb_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, datasync); - } - - return 0; - -unwind: - STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, 
NULL); - return 0; + return 0; } +int32_t +wb_zerofill_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill, + fd, offset, len, xdata); + return 0; +} int32_t -wb_release (xlator_t *this, fd_t *fd) +wb_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) { - uint64_t file_ptr = 0; - wb_file_t *file = NULL; + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); - GF_VALIDATE_OR_GOTO (this->name, fd, out); + wb_inode = wb_inode_ctx_get(this, fd->inode); + if (!wb_inode) + goto noqueue; - fd_ctx_get (fd, this, &file_ptr); - file = (wb_file_t *) (long) file_ptr; + stub = fop_zerofill_stub(frame, wb_zerofill_helper, fd, offset, len, xdata); + if (!stub) + goto unwind; - if (file != NULL) { - LOCK (&file->lock); - { - GF_ASSERT (list_empty (&file->request)); - } - UNLOCK (&file->lock); + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - wb_file_destroy (file); - } + wb_process_queue(wb_inode); -out: - return 0; -} + return 0; +unwind: + STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); -int -wb_priv_dump (xlator_t *this) -{ - wb_conf_t *conf = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; - int ret = -1; + if (stub) + call_stub_destroy(stub); - GF_VALIDATE_OR_GOTO ("write-behind", this, out); +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill, + fd, offset, len, xdata); + return 0; +} - conf = this->private; - GF_VALIDATE_OR_GOTO (this->name, conf, out); +int32_t +wb_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + call_stub_t *stub = NULL; - gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", - "priv"); + wb_inode = wb_inode_ctx_get(this, oldloc->inode); + if (!wb_inode) + goto noqueue; - 
gf_proc_dump_add_section (key_prefix); + stub = fop_rename_stub(frame, default_rename_resume, oldloc, newloc, xdata); + if (!stub) + goto unwind; - gf_proc_dump_write ("aggregate_size", "%d", conf->aggregate_size); - gf_proc_dump_write ("window_size", "%d", conf->window_size); - gf_proc_dump_write ("disable_till", "%d", conf->disable_till); - gf_proc_dump_write ("enable_O_SYNC", "%d", conf->enable_O_SYNC); - gf_proc_dump_write ("flush_behind", "%d", conf->flush_behind); - gf_proc_dump_write ("enable_trickling_writes", "%d", - conf->enable_trickling_writes); + if (!wb_enqueue(wb_inode, stub)) + goto unwind; - ret = 0; -out: - return ret; -} + wb_process_queue(wb_inode); + return 0; -void -__wb_dump_requests (struct list_head *head, char *prefix, char passive) -{ - char key[GF_DUMP_MAX_BUF_LEN] = {0, }; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }, flag = 0; - wb_request_t *request = NULL; +unwind: + if (stub) + call_stub_destroy(stub); - list_for_each_entry (request, head, list) { - gf_proc_dump_build_key (key, prefix, passive ? 
"passive-request" - : "active-request"); - gf_proc_dump_build_key (key_prefix, key, - gf_fop_list[request->fop]); + STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL, + NULL); - gf_proc_dump_add_section(key_prefix); + return 0; - gf_proc_dump_write ("request-ptr", "%p", request); +noqueue: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} - gf_proc_dump_write ("refcount", "%d", request->refcount); +int +wb_forget(xlator_t *this, inode_t *inode) +{ + uint64_t tmp = 0; + wb_inode_t *wb_inode = NULL; - if (request->fop == GF_FOP_WRITE) { - flag = request->flags.write_request.stack_wound; - gf_proc_dump_write ("stack_wound", "%d", flag); + inode_ctx_del(inode, this, &tmp); - gf_proc_dump_write ("size", "%"GF_PRI_SIZET, - request->write_size); + wb_inode = (wb_inode_t *)(long)tmp; - gf_proc_dump_write ("offset", "%"PRId64, - request->stub->args.writev.off); + if (!wb_inode) + return 0; - flag = request->flags.write_request.write_behind; - gf_proc_dump_write ("write_behind", "%d", flag); + wb_inode_destroy(wb_inode); - flag = request->flags.write_request.got_reply; - gf_proc_dump_write ("got_reply", "%d", flag); + return 0; +} - flag = request->flags.write_request.virgin; - gf_proc_dump_write ("virgin", "%d", flag); +int +wb_release(xlator_t *this, fd_t *fd) +{ + uint64_t tmp = 0; - flag = request->flags.write_request.flush_all; - gf_proc_dump_write ("flush_all", "%d", flag); - } else { - flag = request->flags.other_requests.marked_for_resume; - gf_proc_dump_write ("marked_for_resume", "%d", flag); - } - } -} + (void)fd_ctx_del(fd, this, &tmp); + return 0; +} int -wb_file_dump (xlator_t *this, fd_t *fd) +wb_priv_dump(xlator_t *this) { - wb_file_t *file = NULL; - uint64_t tmp_file = 0; - int32_t ret = -1; - char *path = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; + wb_conf_t *conf = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + int ret = -1; - if ((fd == 
NULL) || (this == NULL)) { - ret = 0; - goto out; - } + GF_VALIDATE_OR_GOTO("write-behind", this, out); - ret = fd_ctx_get (fd, this, &tmp_file); - if (ret == -1) { - ret = 0; - goto out; - } + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); - file = (wb_file_t *)(long)tmp_file; - if (file == NULL) { - ret = 0; - goto out; - } + gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind", + "priv"); - gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", - "file"); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_add_section (key_prefix); + gf_proc_dump_write("aggregate_size", "%" PRIu64, conf->aggregate_size); + gf_proc_dump_write("window_size", "%" PRIu64, conf->window_size); + gf_proc_dump_write("flush_behind", "%d", conf->flush_behind); + gf_proc_dump_write("trickling_writes", "%d", conf->trickling_writes); - __inode_path (fd->inode, NULL, &path); - if (path != NULL) { - gf_proc_dump_write ("path", "%s", path); - GF_FREE (path); - } + ret = 0; +out: + return ret; +} - gf_proc_dump_write ("fd", "%p", fd); +void +__wb_dump_requests(struct list_head *head, char *prefix) +{ + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = + { + 0, + }, + flag = 0; + wb_request_t *req = NULL; - gf_proc_dump_write ("disabled", "%d", file->disabled); + list_for_each_entry(req, head, all) + { + gf_proc_dump_build_key(key_prefix, key, "%s", + (char *)gf_fop_list[req->fop]); - gf_proc_dump_write ("disable_till", "%lu", file->disable_till); + gf_proc_dump_add_section("%s", key_prefix); - gf_proc_dump_write ("window_conf", "%"GF_PRI_SIZET, file->window_conf); + gf_proc_dump_write("unique", "%" PRIu64, req->unique); - gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET, file->window_current); + gf_proc_dump_write("refcount", "%d", req->refcount); - gf_proc_dump_write ("flags", "%s", (file->flags & O_APPEND) ? 
"O_APPEND" - : "!O_APPEND"); + if (list_empty(&req->todo)) + gf_proc_dump_write("wound", "yes"); + else + gf_proc_dump_write("wound", "no"); - gf_proc_dump_write ("aggregate_current", "%"GF_PRI_SIZET, file->aggregate_current); + gf_proc_dump_write("generation-number", "%" PRIu64, req->gen); - gf_proc_dump_write ("refcount", "%d", file->refcount); + gf_proc_dump_write("req->op_ret", "%d", req->op_ret); + gf_proc_dump_write("req->op_errno", "%d", req->op_errno); + gf_proc_dump_write("sync-attempts", "%d", req->wind_count); - gf_proc_dump_write ("op_ret", "%d", file->op_ret); + if (req->fop == GF_FOP_WRITE) { + if (list_empty(&req->wip)) + gf_proc_dump_write("sync-in-progress", "no"); + else + gf_proc_dump_write("sync-in-progress", "yes"); - gf_proc_dump_write ("op_errno", "%d", file->op_errno); + gf_proc_dump_write("size", "%" GF_PRI_SIZET, req->write_size); - LOCK (&file->lock); - { - if (!list_empty (&file->request)) { - __wb_dump_requests (&file->request, key_prefix, 0); - } + if (req->stub) + gf_proc_dump_write("offset", "%" PRId64, + req->stub->args.offset); - if (!list_empty (&file->passive_requests)) { - __wb_dump_requests (&file->passive_requests, key_prefix, - 1); - } + flag = req->ordering.lied; + gf_proc_dump_write("lied", "%d", flag); + + flag = req->ordering.append; + gf_proc_dump_write("append", "%d", flag); + + flag = req->ordering.fulfilled; + gf_proc_dump_write("fulfilled", "%d", flag); + + flag = req->ordering.go; + gf_proc_dump_write("go", "%d", flag); } - UNLOCK (&file->lock); + } +} + +int +wb_inode_dump(xlator_t *this, inode_t *inode) +{ + wb_inode_t *wb_inode = NULL; + int32_t ret = -1; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char uuid_str[64] = { + 0, + }; + + if ((inode == NULL) || (this == NULL)) { + ret = 0; + goto out; + } + wb_inode = wb_inode_ctx_get(this, inode); + if (wb_inode == NULL) { ret = 0; -out: - return ret; -} + goto out; + } + uuid_utoa_r(inode->gfid, uuid_str); -int32_t -mem_acct_init 
(xlator_t *this) -{ - int ret = -1; + gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind", + "wb_inode"); - if (!this) { - goto out; - } + gf_proc_dump_add_section("%s", key_prefix); - ret = xlator_mem_acct_init (this, gf_wb_mt_end + 1); + __inode_path(inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - } + gf_proc_dump_write("inode", "%p", inode); -out: - return ret; -} + gf_proc_dump_write("gfid", "%s", uuid_str); + gf_proc_dump_write("window_conf", "%" GF_PRI_SIZET, wb_inode->window_conf); -int -reconfigure (xlator_t *this, dict_t *options) -{ - wb_conf_t *conf = NULL; - int ret = -1; + gf_proc_dump_write("window_current", "%" GF_PRI_SIZET, + wb_inode->window_current); - conf = this->private; + gf_proc_dump_write("transit-size", "%" GF_PRI_SIZET, wb_inode->transit); - GF_OPTION_RECONF ("cache-size", conf->window_size, options, size, out); + gf_proc_dump_write("dontsync", "%d", wb_inode->dontsync); - GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, - out); + ret = TRY_LOCK(&wb_inode->lock); + if (!ret) { + if (!list_empty(&wb_inode->all)) { + __wb_dump_requests(&wb_inode->all, key_prefix); + } + UNLOCK(&wb_inode->lock); + } - ret = 0; + if (ret && wb_inode) + gf_proc_dump_write("Unable to dump the inode information", + "(Lock acquisition failed) %p (gfid: %s)", wb_inode, + uuid_str); + + ret = 0; out: - return ret; + return ret; } - -int32_t -init (xlator_t *this) +int +mem_acct_init(xlator_t *this) { - wb_conf_t *conf = NULL; - int32_t ret = -1; + int ret = -1; - if ((this->children == NULL) - || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: write-behind (%s) not configured with exactly " - "one child", this->name); - goto out; - } + if (!this) { + goto out; + } - if (this->parents == NULL) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. 
check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_wb_mt_wb_conf_t); - if (conf == NULL) { - goto out; - } + ret = xlator_mem_acct_init(this, gf_wb_mt_end + 1); - GF_OPTION_INIT("enable-O_SYNC", conf->enable_O_SYNC, bool, out); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, WRITE_BEHIND_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + } - /* configure 'options aggregate-size <size>' */ - conf->aggregate_size = WB_AGGREGATE_SIZE; +out: + return ret; +} - GF_OPTION_INIT("disable-for-first-nbytes", conf->disable_till, size, - out); +int +reconfigure(xlator_t *this, dict_t *options) +{ + wb_conf_t *conf = NULL; + int ret = -1; - /* configure 'option window-size <size>' */ - GF_OPTION_INIT ("cache-size", conf->window_size, size, out); + conf = this->private; - if (!conf->window_size && conf->aggregate_size) { - gf_log (this->name, GF_LOG_WARNING, - "setting window-size to be equal to " - "aggregate-size(%"PRIu64")", - conf->aggregate_size); - conf->window_size = conf->aggregate_size; - } + GF_OPTION_RECONF("cache-size", conf->window_size, options, size_uint64, + out); - if (conf->window_size < conf->aggregate_size) { - gf_log (this->name, GF_LOG_ERROR, - "aggregate-size(%"PRIu64") cannot be more than " - "window-size(%"PRIu64")", conf->aggregate_size, - conf->window_size); - goto out; - } + GF_OPTION_RECONF("flush-behind", conf->flush_behind, options, bool, out); - /* configure 'option flush-behind <on/off>' */ - GF_OPTION_INIT ("flush-behind", conf->flush_behind, bool, out); + GF_OPTION_RECONF("trickling-writes", conf->trickling_writes, options, bool, + out); - GF_OPTION_INIT ("enable-trickling-writes", conf->enable_trickling_writes, - bool, out); + GF_OPTION_RECONF("strict-O_DIRECT", conf->strict_O_DIRECT, options, bool, + out); - this->private = conf; - ret = 0; + GF_OPTION_RECONF("strict-write-ordering", conf->strict_write_ordering, + options, bool, out); + GF_OPTION_RECONF("resync-failed-syncs-after-fsync", + 
conf->resync_after_fsync, options, bool, out); + ret = 0; out: - if (ret) { - if (conf) - GF_FREE (conf); - } - return ret; + return ret; } +int32_t +init(xlator_t *this) +{ + wb_conf_t *conf = NULL; + int32_t ret = -1; + + if ((this->children == NULL) || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_INIT_FAILED, + "FATAL: write-behind (%s) not configured with exactly " + "one child", + this->name); + goto out; + } + + if (this->parents == NULL) { + gf_msg(this->name, GF_LOG_WARNING, 0, + WRITE_BEHIND_MSG_VOL_MISCONFIGURED, + "dangling volume. check volfilex"); + } + + conf = GF_CALLOC(1, sizeof(*conf), gf_wb_mt_wb_conf_t); + if (conf == NULL) { + goto out; + } + + /* configure 'options aggregate-size <size>' */ + GF_OPTION_INIT("aggregate-size", conf->aggregate_size, size_uint64, out); + conf->page_size = conf->aggregate_size; + + /* configure 'option window-size <size>' */ + GF_OPTION_INIT("cache-size", conf->window_size, size_uint64, out); + + if (!conf->window_size && conf->aggregate_size) { + gf_msg(this->name, GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_SIZE_NOT_SET, + "setting window-size to be equal to " + "aggregate-size(%" PRIu64 ")", + conf->aggregate_size); + conf->window_size = conf->aggregate_size; + } + + if (conf->window_size < conf->aggregate_size) { + gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE, + "aggregate-size(%" PRIu64 + ") cannot be more than " + "window-size(%" PRIu64 ")", + conf->aggregate_size, conf->window_size); + goto out; + } + + /* configure 'option flush-behind <on/off>' */ + GF_OPTION_INIT("flush-behind", conf->flush_behind, bool, out); + + GF_OPTION_INIT("trickling-writes", conf->trickling_writes, bool, out); + + GF_OPTION_INIT("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out); + + GF_OPTION_INIT("strict-write-ordering", conf->strict_write_ordering, bool, + out); + + GF_OPTION_INIT("resync-failed-syncs-after-fsync", conf->resync_after_fsync, + bool, out); + + this->private = 
conf; + ret = 0; + +out: + if (ret) { + GF_FREE(conf); + } + return ret; +} void -fini (xlator_t *this) +fini(xlator_t *this) { - wb_conf_t *conf = NULL; + wb_conf_t *conf = NULL; - GF_VALIDATE_OR_GOTO ("write-behind", this, out); + GF_VALIDATE_OR_GOTO("write-behind", this, out); - conf = this->private; - if (!conf) { - goto out; - } + conf = this->private; + if (!conf) { + goto out; + } - this->private = NULL; - GF_FREE (conf); + this->private = NULL; + GF_FREE(conf); out: - return; + return; } - struct xlator_fops fops = { - .writev = wb_writev, - .open = wb_open, - .create = wb_create, - .readv = wb_readv, - .flush = wb_flush, - .fsync = wb_fsync, - .stat = wb_stat, - .fstat = wb_fstat, - .truncate = wb_truncate, - .ftruncate = wb_ftruncate, - .setattr = wb_setattr, + .writev = wb_writev, + .readv = wb_readv, + .flush = wb_flush, + .fsync = wb_fsync, + .stat = wb_stat, + .fstat = wb_fstat, + .truncate = wb_truncate, + .ftruncate = wb_ftruncate, + .setattr = wb_setattr, + .fsetattr = wb_fsetattr, + .lookup = wb_lookup, + .readdirp = wb_readdirp, + .link = wb_link, + .fallocate = wb_fallocate, + .discard = wb_discard, + .zerofill = wb_zerofill, + .rename = wb_rename, }; -struct xlator_cbks cbks = { - .release = wb_release -}; +struct xlator_cbks cbks = {.forget = wb_forget, .release = wb_release}; struct xlator_dumpops dumpops = { - .priv = wb_priv_dump, - .fdctx = wb_file_dump, + .priv = wb_priv_dump, + .inodectx = wb_inode_dump, }; struct volume_options options[] = { - { .key = {"flush-behind"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "If this option is set ON, instructs write-behind " - "translator to perform flush in background, by " - "returning success (or any errors, if any of " - "previous writes were failed) to application even " - "before flush is sent to backend filesystem. 
" - }, - { .key = {"cache-size", "window-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 512 * GF_UNIT_KB, - .max = 1 * GF_UNIT_GB, - .default_value = "1MB", - .description = "Size of the per-file write-behind buffer. " + { + .key = {"write-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable write-behind", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"flush-behind"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "If this option is set ON, instructs write-behind " + "translator to perform flush in background, by " + "returning success (or any errors, if any of " + "previous writes were failed) to application even " + "before flush FOP is sent to backend filesystem. "}, + {.key = {"cache-size", "window-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 512 * GF_UNIT_KB, + .max = 1 * GF_UNIT_GB, + .default_value = "1MB", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "Size of the write-behind buffer for a single file " + "(inode)."}, + { + .key = {"trickling-writes"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {GD_OP_VERSION_3_13_1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .default_value = "on", + }, + {.key = {"strict-O_DIRECT"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "This option when set to off, ignores the " + "O_DIRECT flag."}, + { + .key = {"strict-write-ordering"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {2}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + 
.description = "Do not let later writes overtake earlier writes even " + "if they do not overlap", + }, + { + .key = {"resync-failed-syncs-after-fsync"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .op_version = {GD_OP_VERSION_3_7_7}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"write-behind"}, + .description = "If sync of \"cached-writes issued before fsync\" " + "(to backend) fails, this option configures whether " + "to retry syncing them after fsync or forget them. " + "If set to on, cached-writes are retried " + "till a \"flush\" fop (or a successful sync) on sync " + "failures. " + "fsync itself is failed irrespective of the value of " + "this option. ", + }, + { + .key = {"aggregate-size"}, + .type = GF_OPTION_TYPE_SIZET, + .default_value = "128KB", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .description = "Will aggregate writes until data of specified " + "size is fully filled for a single file provided " + "there are no dependent fops on cached writes. This " + "option just sets the aggregate size. Note that " + "aggregation won't happen if " + "performance.write-behind-trickling-writes" + " is turned on. 
Hence turn off " + "performance.write-behind.trickling-writes" + " so that writes are aggregated till a max of " + "\"aggregate-size\" bytes", + }, + {.key = {NULL}}, +}; - }, - { .key = {"disable-for-first-nbytes"}, - .type = GF_OPTION_TYPE_SIZET, - .min = 0, - .max = 1 * GF_UNIT_MB, - .default_value = "0", - }, - { .key = {"enable-O_SYNC"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {"enable-trickling-writes"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {NULL} }, +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "write-behind", + .category = GF_MAINTAINED, }; |
