summary | refs | log | tree | commit | diff | stats
path: root/xlators/performance/io-cache/src/io-cache.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/performance/io-cache/src/io-cache.c')
-rw-r--r-- xlators/performance/io-cache/src/io-cache.c 1478
1 files changed, 1478 insertions, 0 deletions
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
new file mode 100644
index 000000000..f367cdb88
--- /dev/null
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -0,0 +1,1478 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "io-cache.h"
+#include <assert.h>
+#include <sys/time.h>
+
+/*
+ * Forward declaration: ioc_get_priority() is defined further down in this
+ * file but is called earlier (e.g. from ioc_lookup_cbk).
+ */
+static uint32_t
+ioc_get_priority (ioc_table_t *table, const char *path);
+
+/*
+ * ioc_inode_reupdate - append an ioc_inode to the LRU list of its weight
+ *
+ * @ioc_inode: inode to link at the tail of table->inode_lru[weight]
+ *
+ * Returns @ioc_inode. The caller in this file (ioc_get_inode) invokes it
+ * with the table lock held.
+ */
+static inline ioc_inode_t *
+ioc_inode_reupdate (ioc_inode_t *ioc_inode)
+{
+        struct list_head *lru_head = NULL;
+
+        lru_head = &ioc_inode->table->inode_lru[ioc_inode->weight];
+        list_add_tail (&ioc_inode->inode_lru, lru_head);
+
+        return ioc_inode;
+}
+
+/*
+ * ioc_get_inode - fetch the ioc_inode stored under a key of a dictionary
+ *
+ * @dict: dictionary to search
+ * @name: key to look up
+ *
+ * Returns NULL when the key is absent. Otherwise returns the stored
+ * ioc_inode, first re-linking it on its table's LRU list (under the table
+ * lock) if it is currently on no list.
+ */
+static inline ioc_inode_t *
+ioc_get_inode (dict_t *dict, char *name)
+{
+        data_t      *entry = dict_get (dict, name);
+        ioc_inode_t *found = NULL;
+        ioc_table_t *owner = NULL;
+
+        if (!entry)
+                return NULL;
+
+        found = data_to_ptr (entry);
+        owner = found->table;
+
+        ioc_table_lock (owner);
+        {
+                if (list_empty (&found->inode_lru))
+                        found = ioc_inode_reupdate (found);
+        }
+        ioc_table_unlock (owner);
+
+        return found;
+}
+
+/*
+ * ioc_inode_need_revalidate - has the inode's cache outlived cache_timeout?
+ *
+ * @ioc_inode: inode whose last-validation timestamp (tv) is examined
+ *
+ * Returns 1 when the time elapsed since ioc_inode->tv is at least the
+ * table's cache_timeout, 0 otherwise. If the current time cannot be read,
+ * 1 is returned so that a possibly stale cache gets revalidated rather
+ * than served.
+ */
+int32_t
+ioc_inode_need_revalidate (ioc_inode_t *ioc_inode)
+{
+        struct timeval  now = {0,};
+        ioc_table_t    *table = ioc_inode->table;
+
+        if (gettimeofday (&now, NULL) != 0) {
+                /* no reliable clock reading: err on the side of revalidating */
+                return 1;
+        }
+
+        if (time_elapsed (&now, &ioc_inode->tv) >= table->cache_timeout)
+                return 1;
+
+        return 0;
+}
+
+/*
+ * __ioc_inode_flush - try to destroy every page cached for an inode
+ *
+ * @ioc_inode: inode whose page list is walked
+ *
+ * Assumes the ioc_inode lock is held by the caller. Returns the sum of the
+ * values returned by ioc_page_destroy(), ignoring pages for which it
+ * returned -1.
+ */
+int32_t
+__ioc_inode_flush (ioc_inode_t *ioc_inode)
+{
+        ioc_page_t *page = NULL;
+        ioc_page_t *lookahead = NULL;
+        int32_t     freed = 0;
+        int32_t     page_ret = 0;
+
+        /* _safe iterator: ioc_page_destroy() presumably unlinks the page */
+        list_for_each_entry_safe (page, lookahead, &ioc_inode->pages, pages) {
+                page_ret = ioc_page_destroy (page);
+                if (page_ret == -1)
+                        continue;
+
+                freed += page_ret;
+        }
+
+        return freed;
+}
+
+/*
+ * ioc_inode_flush - flush an inode's pages and update the cache accounting
+ *
+ * @ioc_inode: inode to flush
+ *
+ * Takes the inode lock around __ioc_inode_flush(), then, if anything was
+ * destroyed, subtracts that amount from table->cache_used under the table
+ * lock.
+ */
+void
+ioc_inode_flush (ioc_inode_t *ioc_inode)
+{
+        int32_t freed = 0;
+
+        ioc_inode_lock (ioc_inode);
+        {
+                freed = __ioc_inode_flush (ioc_inode);
+        }
+        ioc_inode_unlock (ioc_inode);
+
+        if (freed == 0)
+                return;
+
+        ioc_table_lock (ioc_inode->table);
+        {
+                ioc_inode->table->cache_used -= freed;
+        }
+        ioc_table_unlock (ioc_inode->table);
+}
+
+/*
+ * ioc_utimens_cbk - utimens callback; passes the child's reply upwards
+ *
+ * @frame: call frame to unwind
+ * @cookie: not used
+ * @this: not used
+ * @op_ret: result forwarded unchanged
+ * @op_errno: errno forwarded unchanged
+ * @stbuf: attributes forwarded unchanged
+ *
+ */
+int32_t
+ioc_utimens_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+
+        return 0;
+}
+
+/*
+ * ioc_utimens - utimens fop; drops cached pages before winding to the child
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location whose timestamps are being changed
+ * @tv: new timestamps, passed through to the child
+ *
+ */
+int32_t
+ioc_utimens (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             struct timespec *tv)
+{
+        uint64_t ctx_value = 0;
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+
+        /* a non-zero context is the ioc_inode pointer stored for this inode */
+        if (ctx_value != 0)
+                ioc_inode_flush ((ioc_inode_t *)(long)ctx_value);
+
+        STACK_WIND (frame, ioc_utimens_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->utimens,
+                    loc, tv);
+
+        return 0;
+}
+
+/*
+ * ioc_lookup_cbk - lookup callback: refresh cache state for the inode and
+ * store or serve whole-file content through the "glusterfs.content" key
+ *
+ * @frame: call frame; frame->local is the ioc_local_t set by ioc_lookup(),
+ * or NULL when ioc_lookup() saw no content request
+ * @cookie: not used
+ * @this: io-cache xlator; this->private is the ioc_table_t
+ * @op_ret: 0 on success; any other value is unwound without touching the cache
+ * @op_errno: forwarded unchanged
+ * @inode: inode the lookup resolved
+ * @stbuf: attributes returned by the child
+ * @dict: reply dictionary, may carry "glusterfs.content"
+ */
+int32_t
+ioc_lookup_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct stat *stbuf,
+ dict_t *dict)
+{
+ ioc_inode_t *ioc_inode = NULL;
+ ioc_local_t *local = frame->local;
+ ioc_table_t *table = this->private;
+ ioc_page_t *page = NULL;
+ data_t *page_data = NULL;
+ data_t *content_data = NULL;
+ char *src = NULL;
+ char *dst = NULL;
+ char need_unref = 0;
+ uint8_t cache_still_valid = 0;
+ uint32_t weight = 0;
+ uint64_t tmp_ioc_inode = 0;
+ char *buf = NULL;
+ char *tmp = NULL;
+ int i;
+
+ /* failed lookup: nothing to cache, hand the reply straight up */
+ if (op_ret != 0)
+ goto out;
+
+ /* an ioc_inode may already be attached to this inode: revalidate it */
+ inode_ctx_get (inode, this, &tmp_ioc_inode);
+ ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+ if (ioc_inode) {
+ cache_still_valid = ioc_cache_still_valid (ioc_inode,
+ stbuf);
+
+ if (!cache_still_valid) {
+ ioc_inode_flush (ioc_inode);
+ }
+ /* update the time-stamp of revalidation */
+ ioc_inode_lock (ioc_inode);
+ {
+ gettimeofday (&ioc_inode->tv, NULL);
+ }
+ ioc_inode_unlock (ioc_inode);
+
+ /* mark the inode as most recently used within its weight class */
+ ioc_table_lock (ioc_inode->table);
+ {
+ list_move_tail (&ioc_inode->inode_lru,
+ &table->inode_lru[ioc_inode->weight]);
+ }
+ ioc_table_unlock (ioc_inode->table);
+ }
+
+ /* frame->local exists only when ioc_lookup() saw a content request;
+ * continue only for non-empty files no larger than need_xattr */
+ if (local && stbuf->st_size &&
+ local->need_xattr >= stbuf->st_size) {
+ /* first time this inode is seen here: create its ioc_inode */
+ if (!ioc_inode) {
+ weight = ioc_get_priority (table,
+ local->file_loc.path);
+ ioc_inode = ioc_inode_update (table,
+ inode, weight);
+ inode_ctx_put (inode, this,
+ (uint64_t)(long)ioc_inode);
+ }
+
+ ioc_inode_lock (ioc_inode);
+ {
+ content_data = dict_get (dict, "glusterfs.content");
+ page = ioc_page_get (ioc_inode, 0);
+
+ if (content_data) {
+ /* the reply carries the file content: (re)fill page 0 */
+ if (page) {
+ /* release the old buffer and un-account its size */
+ dict_unref (page->ref);
+ free (page->vector);
+ page->vector = NULL;
+
+ ioc_table_lock (table);
+ {
+ table->cache_used -=
+ page->size;
+ }
+ ioc_table_unlock (table);
+ } else {
+ page = ioc_page_create (ioc_inode, 0);
+ }
+
+ /* NOTE(review): neither the CALLOC results nor the page
+ * returned by ioc_page_create() are checked before use */
+ dst = CALLOC (1, stbuf->st_size);
+ page->ref = dict_ref (get_new_dict ());
+ page_data = data_from_dynptr (dst,
+ stbuf->st_size);
+ dict_set (page->ref, NULL, page_data);
+
+ /* assumes content_data holds at least st_size bytes -
+ * TODO confirm */
+ src = data_to_ptr (content_data);
+ memcpy (dst, src, stbuf->st_size);
+
+ page->vector = CALLOC (1,
+ sizeof (*page->vector));
+ page->vector->iov_base = dst;
+ page->vector->iov_len = stbuf->st_size;
+ page->count = 1;
+
+ page->waitq = NULL;
+ page->size = stbuf->st_size;
+ page->ready = 1;
+
+ ioc_table_lock (table);
+ {
+ table->cache_used += page->size;
+ }
+ ioc_table_unlock (table);
+
+ } else {
+ /* no content in the reply and no ready page 0: wind the
+ * lookup again with the saved loc.
+ * NOTE(review): ioc_lookup() as shown never assigns
+ * local->xattr_req, so the CALLOC'ed NULL is passed -
+ * confirm this is intended */
+ if (!(page && page->ready)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "page not present");
+
+ ioc_inode_unlock (ioc_inode);
+ STACK_WIND (frame,
+ ioc_lookup_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup,
+ &local->file_loc,
+ local->xattr_req);
+ return 0;
+ }
+ /* serve from cache: flatten page 0's iovecs into one
+ * buffer; assumes they total at most st_size bytes -
+ * TODO confirm */
+ buf = CALLOC (1, stbuf->st_size);
+ tmp = buf;
+
+ for (i = 0; i < page->count; i++) {
+ memcpy (tmp, page->vector[i].iov_base,
+ page->vector[i].iov_len);
+ tmp += page->vector[i].iov_len;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "serving file %s from cache",
+ local->file_loc.path);
+
+ /* child sent no dict: make one and drop our reference
+ * after unwinding */
+ if (!dict) {
+ need_unref = 1;
+ dict = dict_ref (
+ get_new_dict ());
+ }
+ dict_set (dict, "glusterfs.content",
+ data_from_dynptr (buf,
+ stbuf->st_size));
+ }
+
+ /* remember the mtime and the time of this validation */
+ ioc_inode->mtime = stbuf->st_mtime;
+ gettimeofday (&ioc_inode->tv, NULL);
+ }
+ ioc_inode_unlock (ioc_inode);
+
+ /* prune only when new content was stored in this call */
+ if (content_data &&
+ ioc_need_prune (ioc_inode->table)) {
+ ioc_prune (ioc_inode->table);
+ }
+ }
+
+ out:
+ STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, dict);
+
+ if (need_unref) {
+ dict_unref (dict);
+ }
+
+ return 0;
+}
+
+/*
+ * ioc_lookup - lookup fop; notes a file-content request before winding
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location being looked up
+ * @xattr_req: request dictionary, passed through to the child
+ *
+ * When GF_FILE_CONTENT_REQUESTED() reports a content request, an
+ * ioc_local_t recording the limit and the loc's path/inode is attached to
+ * the frame for ioc_lookup_cbk(). If page 0 of the inode is already cached
+ * and ready and the limit fits in one page, need_xattr is set to -1.
+ */
+int32_t
+ioc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            dict_t *xattr_req)
+{
+        uint64_t content_limit = 0;
+
+        if (GF_FILE_CONTENT_REQUESTED(xattr_req, &content_limit)) {
+                uint64_t     tmp_ioc_inode = 0;
+                ioc_inode_t *ioc_inode = NULL;
+                ioc_page_t  *page = NULL;
+                ioc_local_t *local = CALLOC (1, sizeof (*local));
+
+                /* same out-of-memory policy as the other fops in this file */
+                ERR_ABORT (local);
+
+                local->need_xattr = content_limit;
+                local->file_loc.path = loc->path;
+                local->file_loc.inode = loc->inode;
+                frame->local = local;
+
+                inode_ctx_get (loc->inode, this, &tmp_ioc_inode);
+                ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+
+                if (ioc_inode) {
+                        ioc_inode_lock (ioc_inode);
+                        {
+                                page = ioc_page_get (ioc_inode, 0);
+                                if ((content_limit <=
+                                     ioc_inode->table->page_size) &&
+                                    page && page->ready) {
+                                        local->need_xattr = -1;
+                                }
+                        }
+                        ioc_inode_unlock (ioc_inode);
+                }
+        }
+
+        STACK_WIND (frame,
+                    ioc_lookup_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup,
+                    loc,
+                    xattr_req);
+        return 0;
+}
+
+/*
+ * ioc_forget - forget callback; destroys the cache state kept for an inode
+ *
+ * @this: io-cache xlator
+ * @inode: inode being forgotten
+ *
+ * Always returns 0.
+ */
+int32_t
+ioc_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx_value = 0;
+
+        inode_ctx_get (inode, this, &ctx_value);
+
+        /* a non-zero context is the ioc_inode pointer stored for this inode */
+        if (ctx_value != 0)
+                ioc_inode_destroy ((ioc_inode_t *)(long)ctx_value);
+
+        return 0;
+}
+
+
+/*
+ * ioc_cache_validate_cbk - callback of the fstat sent by ioc_cache_validate()
+ *
+ * @frame: validate frame; frame->local holds the ioc_inode and an fd ref
+ * @cookie: not used
+ * @this: not used
+ * @op_ret: fstat result
+ * @op_errno: not used
+ * @stbuf: attributes returned by the child
+ *
+ * Flushes the inode's pages when the fstat failed or the cache is no longer
+ * valid, refreshes the validation timestamp, wakes the frames waiting on
+ * the inode and destroys the validate frame.
+ */
+int32_t
+ioc_cache_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        ioc_local_t *local = frame->local;
+        ioc_inode_t *ioc_inode = local->inode;
+        struct stat *wakeup_stbuf = stbuf;
+        size_t       freed = 0;
+        int          stale = 0;
+
+        if (op_ret == -1)
+                stale = 1;
+        else if (op_ret >= 0)
+                stale = !ioc_cache_still_valid(ioc_inode, stbuf);
+
+        if (stale) {
+                gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
+                        "cache for inode(%p) is invalid. flushing all pages",
+                        ioc_inode);
+                /* NOTE: only pages with no waiting frames are flushed by
+                 * ioc_inode_flush. page_fault will be generated for all
+                 * the pages which have waiting frames by ioc_inode_wakeup()
+                 */
+                ioc_inode_lock (ioc_inode);
+                {
+                        freed = __ioc_inode_flush (ioc_inode);
+                        if (op_ret >= 0)
+                                ioc_inode->mtime = stbuf->st_mtime;
+                }
+                ioc_inode_unlock (ioc_inode);
+                wakeup_stbuf = NULL;
+        }
+
+        /* give the flushed bytes back to the table's accounting */
+        if (freed) {
+                ioc_table_lock (ioc_inode->table);
+                {
+                        ioc_inode->table->cache_used -= freed;
+                }
+                ioc_table_unlock (ioc_inode->table);
+        }
+
+        if (op_ret < 0)
+                wakeup_stbuf = NULL;
+
+        /* record the time of this validation */
+        ioc_inode_lock (ioc_inode);
+        {
+                gettimeofday (&ioc_inode->tv, NULL);
+        }
+        ioc_inode_unlock (ioc_inode);
+
+        ioc_inode_wakeup (frame, ioc_inode, wakeup_stbuf);
+
+        /* any page-fault initiated by ioc_inode_wakeup() will have its own
+         * fd_ref on fd, safe to unref validate frame's private copy
+         */
+        fd_unref (local->fd);
+
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+/*
+ * ioc_wait_on_inode - queue a page on the inode's wait queue, once
+ *
+ * @ioc_inode: inode whose waitq is updated
+ * @page: page to queue
+ *
+ * Does nothing if @page is already on the queue; otherwise a new entry is
+ * pushed at the head. Always returns 0. The caller in this file holds the
+ * inode lock.
+ */
+static int32_t
+ioc_wait_on_inode (ioc_inode_t *ioc_inode, ioc_page_t *page)
+{
+        ioc_waitq_t *entry = NULL;
+
+        for (entry = ioc_inode->waitq; entry != NULL; entry = entry->next) {
+                if (entry->data == page)
+                        return 0;       /* already queued */
+        }
+
+        entry = CALLOC (1, sizeof (ioc_waitq_t));
+        ERR_ABORT (entry);
+        entry->data = page;
+        entry->next = ioc_inode->waitq;
+        ioc_inode->waitq = entry;
+
+        return 0;
+}
+
+/*
+ * ioc_cache_validate - send an fstat to the child to revalidate the cache
+ *
+ * @frame: frame of the read that triggered the validation; it is copied
+ * @ioc_inode: inode to validate, handed to the callback through the local
+ * @fd: fd to fstat; a reference is taken for the validate frame
+ * @page: not used by this function
+ *
+ * The reply is handled by ioc_cache_validate_cbk(). Always returns 0.
+ */
+static int32_t
+ioc_cache_validate (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd,
+                    ioc_page_t *page)
+{
+        ioc_local_t  *fstat_local = NULL;
+        call_frame_t *fstat_frame = NULL;
+
+        fstat_local = CALLOC (1, sizeof (ioc_local_t));
+        ERR_ABORT (fstat_local);
+
+        fstat_frame = copy_frame (frame);
+
+        fstat_local->fd = fd_ref (fd);
+        fstat_local->inode = ioc_inode;
+        fstat_frame->local = fstat_local;
+
+        STACK_WIND (fstat_frame,
+                    ioc_cache_validate_cbk,
+                    FIRST_CHILD (frame->this),
+                    FIRST_CHILD (frame->this)->fops->fstat,
+                    fd);
+
+        return 0;
+}
+
+/*
+ * is_match - test a path against a shell-style wildcard pattern
+ *
+ * @path: string to test
+ * @pattern: fnmatch() pattern; FNM_NOESCAPE is passed, so a backslash in
+ * the pattern is an ordinary character
+ *
+ * Returns 1 when fnmatch() reports a match, 0 otherwise.
+ */
+static inline uint32_t
+is_match (const char *path, const char *pattern)
+{
+        return (fnmatch (pattern, path, FNM_NOESCAPE) == 0);
+}
+
+/*
+ * ioc_get_priority - look up the cache priority configured for a path
+ *
+ * @table: io-cache table holding the priority_list
+ * @path: path matched against each rule's pattern
+ *
+ * Returns the priority of the last matching rule in the list, or 0 when no
+ * rule matches.
+ */
+static uint32_t
+ioc_get_priority (ioc_table_t *table, const char *path)
+{
+        struct ioc_priority *rule = NULL;
+        uint32_t             weight = 0;
+
+        /* no early exit: a later matching rule overrides an earlier one */
+        list_for_each_entry (rule, &table->priority_list, list) {
+                if (!is_match (path, rule->pattern))
+                        continue;
+
+                weight = rule->priority;
+        }
+
+        return weight;
+}
+
+/*
+ * ioc_open_cbk - open callback for io cache
+ *
+ * @frame: call frame; frame->local carries the open flags and the loc
+ * @cookie: not used
+ * @this: io-cache xlator
+ * @op_ret: -1 on failure; anything else is treated as success
+ * @op_errno: forwarded unchanged
+ * @fd: the opened fd
+ *
+ * On success the inode gets an ioc_inode (created on first open, otherwise
+ * moved to the tail of its LRU list), and caching is disabled for this fd
+ * when mandatory locking is enabled on the file or the open was O_DIRECT.
+ */
+int32_t
+ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        ioc_local_t *local = frame->local;
+        ioc_table_t *table = this->private;
+        inode_t     *inode = local->file_loc.inode;
+        const char  *path = local->file_loc.path;
+        ioc_inode_t *ioc_inode = NULL;
+        uint64_t     ctx_value = 0;
+        uint32_t     weight = 0;
+
+        if (op_ret == -1)
+                goto unwind;
+
+        /* look for ioc_inode corresponding to this fd */
+        LOCK (&fd->inode->lock);
+        {
+                inode_ctx_get (fd->inode, this, &ctx_value);
+                ioc_inode = (ioc_inode_t *)(long)ctx_value;
+
+                if (ioc_inode) {
+                        ioc_table_lock (ioc_inode->table);
+                        {
+                                list_move_tail (&ioc_inode->inode_lru,
+                                                &table->inode_lru[ioc_inode->weight]);
+                        }
+                        ioc_table_unlock (ioc_inode->table);
+                } else {
+                        /* this is the first time someone is opening this
+                           file, assign weight
+                        */
+                        weight = ioc_get_priority (table, path);
+
+                        ioc_inode = ioc_inode_update (table, inode, weight);
+                        inode_ctx_put (fd->inode, this,
+                                       (uint64_t)(long)ioc_inode);
+                }
+        }
+        UNLOCK (&fd->inode->lock);
+
+        /* If mandatory locking has been enabled on this file,
+           we disable caching on it */
+        if ((inode->st_mode & S_ISGID) && !(inode->st_mode & S_IXGRP))
+                fd_ctx_set (fd, this, 1);
+
+        /* O_DIRECT is only for one fd, not the inode as a whole, so
+         * caching is disabled on the fd only */
+        if (local->flags & O_DIRECT)
+                fd_ctx_set (fd, this, 1);
+
+unwind:
+        FREE (local);
+        frame->local = NULL;
+
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+        return 0;
+}
+
+/*
+ * ioc_create_cbk - create callback for io cache
+ *
+ * @frame: call frame; frame->local carries the create flags and the path
+ * @cookie: not used
+ * @this: io-cache xlator
+ * @op_ret: -1 on failure; anything else is treated as success
+ * @op_errno: forwarded unchanged
+ * @fd: the fd opened by the create
+ * @inode: the newly created inode
+ * @buf: attributes of the new file, forwarded unchanged
+ *
+ * On success an ioc_inode is created for the inode and stored in its
+ * context; caching is disabled for this fd when mandatory locking is
+ * enabled on the file or the create was O_DIRECT.
+ */
+int32_t
+ioc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd,
+                inode_t *inode, struct stat *buf)
+{
+        ioc_local_t *local = frame->local;
+        ioc_table_t *table = this->private;
+        const char  *path = local->file_loc.path;
+        ioc_inode_t *ioc_inode = NULL;
+        uint32_t     weight = 0;
+
+        if (op_ret == -1)
+                goto unwind;
+
+        /* assign weight */
+        weight = ioc_get_priority (table, path);
+        ioc_inode = ioc_inode_update (table, inode, weight);
+
+        LOCK (&fd->inode->lock);
+        {
+                inode_ctx_put (fd->inode, this,
+                               (uint64_t)(long)ioc_inode);
+        }
+        UNLOCK (&fd->inode->lock);
+
+        /* If mandatory locking has been enabled on this file,
+           we disable caching on it */
+        if ((inode->st_mode & S_ISGID) && !(inode->st_mode & S_IXGRP))
+                fd_ctx_set (fd, this, 1);
+
+        /* O_DIRECT is only for one fd, not the inode as a whole, so
+         * caching is disabled on the fd only */
+        if (local->flags & O_DIRECT)
+                fd_ctx_set (fd, this, 1);
+
+unwind:
+        frame->local = NULL;
+        FREE (local);
+
+        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+
+        return 0;
+}
+
+/*
+ * ioc_open - open fop for io cache
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location being opened
+ * @flags: open flags, remembered for the callback
+ * @fd: fd being opened
+ *
+ * Saves the flags and the loc's path/inode in frame->local for
+ * ioc_open_cbk() and winds the open to the child.
+ */
+int32_t
+ioc_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          fd_t *fd)
+{
+        ioc_local_t *open_local = NULL;
+
+        open_local = CALLOC (1, sizeof (ioc_local_t));
+        ERR_ABORT (open_local);
+
+        open_local->flags = flags;
+        open_local->file_loc.path = loc->path;
+        open_local->file_loc.inode = loc->inode;
+
+        frame->local = open_local;
+
+        STACK_WIND (frame, ioc_open_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->open,
+                    loc, flags, fd);
+
+        return 0;
+}
+
+/*
+ * ioc_create - create fop for io cache
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location being created
+ * @flags: open flags, remembered for the callback
+ * @mode: file mode, passed through to the child
+ * @fd: fd being opened
+ *
+ * Saves the flags and the path in frame->local for ioc_create_cbk() and
+ * winds the create to the child.
+ */
+int32_t
+ioc_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            mode_t mode, fd_t *fd)
+{
+        ioc_local_t *create_local = NULL;
+
+        create_local = CALLOC (1, sizeof (ioc_local_t));
+        ERR_ABORT (create_local);
+
+        create_local->flags = flags;
+        create_local->file_loc.path = loc->path;
+        frame->local = create_local;
+
+        STACK_WIND (frame,
+                    ioc_create_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->create,
+                    loc,
+                    flags,
+                    mode,
+                    fd);
+
+        return 0;
+}
+
+
+
+
+/*
+ * ioc_release - release callback for io cache
+ *
+ * @this: not used
+ * @fd: not used
+ *
+ * Nothing is done here; always returns 0.
+ */
+int32_t
+ioc_release (xlator_t *this, fd_t *fd)
+{
+        return 0;
+}
+
+/*
+ * ioc_readv_disabled_cbk - readv callback used when the cache is bypassed
+ *
+ * @frame: call frame to unwind
+ * @cookie: not used
+ * @this: not used
+ * @op_ret: result forwarded unchanged
+ * @op_errno: errno forwarded unchanged
+ * @vector: data vectors forwarded unchanged
+ * @count: number of vectors
+ * @stbuf: attributes forwarded unchanged
+ *
+ */
+int32_t
+ioc_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iovec *vector, int32_t count,
+                        struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+        return 0;
+}
+
+
+/*
+ * ioc_need_prune - is the cache using more than its configured size?
+ *
+ * @table: io-cache table
+ *
+ * Reads cache_used and cache_size under the table lock. Returns 1 when
+ * their difference is positive, 0 otherwise.
+ */
+int32_t
+ioc_need_prune (ioc_table_t *table)
+{
+        int64_t excess = 0;
+
+        ioc_table_lock (table);
+        {
+                excess = table->cache_used - table->cache_size;
+        }
+        ioc_table_unlock (table);
+
+        return (excess > 0) ? 1 : 0;
+}
+
+/*
+ * dispatch_requests - walk the page-aligned range covering a read and, for
+ * each page, serve it, wait on it, fault it in or schedule a revalidation
+ *
+ * @frame: read frame; frame->local is its ioc_local_t
+ * @ioc_inode: cache inode being read
+ * @fd: fd the read was issued on
+ * @offset: start of the requested region
+ * @size: length of the requested region
+ *
+ * After the walk the frame's own wait reference is returned through
+ * ioc_frame_return() and the table is pruned if it is over its size.
+ */
+static void
+dispatch_requests (call_frame_t *frame,
+ ioc_inode_t *ioc_inode,
+ fd_t *fd,
+ off_t offset,
+ size_t size)
+{
+ ioc_local_t *local = frame->local;
+ ioc_table_t *table = ioc_inode->table;
+ ioc_page_t *trav = NULL;
+ ioc_waitq_t *waitq = NULL;
+ off_t rounded_offset = 0;
+ off_t rounded_end = 0;
+ off_t trav_offset = 0;
+ int32_t fault = 0;
+ int8_t need_validate = 0;
+ int8_t might_need_validate = 0; /* if a page exists, do we need
+ to validate it? */
+
+ /* widen the request to whole pages */
+ rounded_offset = floor (offset, table->page_size);
+ rounded_end = roof (offset + size, table->page_size);
+ trav_offset = rounded_offset;
+
+ /* once a frame does read, it should be waiting on something */
+ local->wait_count++;
+
+ /* Requested region can fall in three different pages,
+ * 1. Ready - region is already in cache, we just have to serve it.
+ * 2. In-transit - page fault has been generated on this page, we need
+ * to wait till the page is ready
+ * 3. Fault - page is not in cache, we have to generate a page fault
+ */
+
+ /* decided once for the whole request, not per page */
+ might_need_validate = ioc_inode_need_revalidate (ioc_inode);
+
+ while (trav_offset < rounded_end) {
+ size_t trav_size = 0;
+ off_t local_offset = 0;
+
+ ioc_inode_lock (ioc_inode);
+ //{
+
+ /* look for requested region in the cache */
+ trav = ioc_page_get (ioc_inode, trav_offset);
+
+ /* part of the request that falls inside this page */
+ local_offset = max (trav_offset, offset);
+ trav_size = min (((offset+size) - local_offset),
+ table->page_size);
+
+ if (!trav) {
+ /* page not in cache, we need to generate page fault */
+ trav = ioc_page_create (ioc_inode, trav_offset);
+ fault = 1;
+ /* NOTE(review): a NULL page is only logged here; it is
+ * still dereferenced by the statements below */
+ if (!trav) {
+ gf_log (frame->this->name, GF_LOG_CRITICAL,
+ "ioc_page_create returned NULL");
+ }
+ }
+
+ /* presumably registers the frame as a waiter on this page for
+ * its slice of the request - confirm in ioc_wait_on_page() */
+ ioc_wait_on_page (trav, frame, local_offset, trav_size);
+
+ if (trav->ready) {
+ /* page found in cache */
+ if (!might_need_validate) {
+ /* fresh enough */
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "cache hit for trav_offset=%"PRId64""
+ "/local_offset=%"PRId64"",
+ trav_offset, local_offset);
+ waitq = ioc_page_wakeup (trav);
+ } else {
+ /* if waitq already exists, fstat revalidate is
+ already on the way */
+ if (!ioc_inode->waitq) {
+ need_validate = 1;
+ }
+ ioc_wait_on_inode (ioc_inode, trav);
+ }
+ }
+
+ //}
+ ioc_inode_unlock (ioc_inode);
+
+ /* the calls below are made after the inode lock is dropped */
+ ioc_waitq_return (waitq);
+ waitq = NULL;
+
+ if (fault) {
+ fault = 0;
+ /* new page created, increase the table->cache_used */
+ ioc_page_fault (ioc_inode, frame, fd, trav_offset);
+ }
+
+ if (need_validate) {
+ need_validate = 0;
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "sending validate request for "
+ "inode(%"PRId64") at offset=%"PRId64"",
+ fd->inode->ino, trav_offset);
+ ioc_cache_validate (frame, ioc_inode, fd, trav);
+ }
+
+ trav_offset += table->page_size;
+ }
+
+ /* presumably balances the wait_count++ taken above - confirm in
+ * ioc_frame_return() */
+ ioc_frame_return (frame);
+
+ if (ioc_need_prune (ioc_inode->table)) {
+ ioc_prune (ioc_inode->table);
+ }
+
+ return;
+}
+
+
+/*
+ * ioc_readv - readv fop; serves the read through the cache when possible
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @fd: fd being read
+ * @size: number of bytes requested
+ * @offset: file offset of the read
+ *
+ * The read is wound straight to the child when the inode has no ioc_inode
+ * or when fd_ctx_get() returns 0 for this fd (ioc_open_cbk/ioc_create_cbk
+ * set an fd context to disable caching). Otherwise an ioc_local_t is set
+ * up, the inode is moved to the tail of its LRU list and the request is
+ * handed to dispatch_requests().
+ */
+int32_t
+ioc_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t offset)
+{
+        uint64_t     ctx_value = 0;
+        ioc_inode_t *ioc_inode = NULL;
+        ioc_local_t *local = NULL;
+        uint32_t     weight = 0;
+
+        inode_ctx_get (fd->inode, this, &ctx_value);
+        ioc_inode = (ioc_inode_t *)(long)ctx_value;
+
+        /* caching disabled for the inode or for this fd, go ahead with
+         * normal readv; the fd context is consulted only when an
+         * ioc_inode exists */
+        if (!ioc_inode || !fd_ctx_get (fd, this, NULL)) {
+                STACK_WIND (frame,
+                            ioc_readv_disabled_cbk,
+                            FIRST_CHILD (frame->this),
+                            FIRST_CHILD (frame->this)->fops->readv,
+                            fd,
+                            size,
+                            offset);
+                return 0;
+        }
+
+        local = (ioc_local_t *) CALLOC (1, sizeof (ioc_local_t));
+        ERR_ABORT (local);
+        INIT_LIST_HEAD (&local->fill_list);
+
+        frame->local = local;
+        local->pending_offset = offset;
+        local->pending_size = size;
+        local->offset = offset;
+        local->size = size;
+        local->inode = ioc_inode;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"",
+                frame, offset, size);
+
+        weight = ioc_inode->weight;
+
+        /* mark the inode as most recently used within its weight class */
+        ioc_table_lock (ioc_inode->table);
+        {
+                list_move_tail (&ioc_inode->inode_lru,
+                                &ioc_inode->table->inode_lru[weight]);
+        }
+        ioc_table_unlock (ioc_inode->table);
+
+        dispatch_requests (frame, ioc_inode, fd, offset, size);
+
+        return 0;
+}
+
+/*
+ * ioc_writev_cbk - writev callback; flushes the inode's cache again
+ *
+ * @frame: call frame; frame->local holds the fd that was written
+ * @cookie: not used
+ * @this: io-cache xlator
+ * @op_ret: result forwarded unchanged
+ * @op_errno: errno forwarded unchanged
+ * @stbuf: attributes forwarded unchanged
+ *
+ */
+int32_t
+ioc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        ioc_local_t *local = frame->local;
+        uint64_t     ctx_value = 0;
+
+        inode_ctx_get (local->fd->inode, this, &ctx_value);
+
+        /* a non-zero context is the ioc_inode pointer stored for this inode */
+        if (ctx_value != 0)
+                ioc_inode_flush ((ioc_inode_t *)(long)ctx_value);
+
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+
+        return 0;
+}
+
+/*
+ * ioc_writev - writev fop; drops cached pages before winding to the child
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @fd: fd being written
+ * @vector: data vectors, passed through
+ * @count: number of vectors
+ * @offset: file offset of the write
+ *
+ * The fd is remembered in frame->local so ioc_writev_cbk() can flush the
+ * inode once more when the write returns.
+ */
+int32_t
+ioc_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+            struct iovec *vector, int32_t count, off_t offset)
+{
+        ioc_local_t *write_local = NULL;
+        uint64_t     ctx_value = 0;
+
+        write_local = CALLOC (1, sizeof (ioc_local_t));
+        ERR_ABORT (write_local);
+
+        /* TODO: why is it not fd_ref'ed */
+        write_local->fd = fd;
+        frame->local = write_local;
+
+        inode_ctx_get (fd->inode, this, &ctx_value);
+        if (ctx_value != 0)
+                ioc_inode_flush ((ioc_inode_t *)(long)ctx_value);
+
+        STACK_WIND (frame, ioc_writev_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->writev,
+                    fd, vector, count, offset);
+
+        return 0;
+}
+
+/*
+ * ioc_truncate_cbk - callback shared by truncate and ftruncate
+ *
+ * @frame: call frame to unwind
+ * @cookie: not used
+ * @this: not used
+ * @op_ret: result forwarded unchanged
+ * @op_errno: errno forwarded unchanged
+ * @buf: attributes forwarded unchanged
+ *
+ */
+int32_t
+ioc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+
+        return 0;
+}
+
+/*
+ * ioc_truncate - truncate fop; drops cached pages before winding
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location being truncated
+ * @offset: new file size, passed through
+ *
+ */
+int32_t
+ioc_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
+{
+        uint64_t ctx_value = 0;
+
+        inode_ctx_get (loc->inode, this, &ctx_value);
+
+        /* a non-zero context is the ioc_inode pointer stored for this inode */
+        if (ctx_value != 0)
+                ioc_inode_flush ((ioc_inode_t *)(long)ctx_value);
+
+        STACK_WIND (frame, ioc_truncate_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->truncate,
+                    loc, offset);
+
+        return 0;
+}
+
+/*
+ * ioc_ftruncate - ftruncate fop; drops cached pages before winding
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @fd: fd being truncated
+ * @offset: new file size, passed through
+ *
+ * The reply is handled by ioc_truncate_cbk().
+ */
+int32_t
+ioc_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
+{
+        uint64_t ctx_value = 0;
+
+        inode_ctx_get (fd->inode, this, &ctx_value);
+
+        /* a non-zero context is the ioc_inode pointer stored for this inode */
+        if (ctx_value != 0)
+                ioc_inode_flush ((ioc_inode_t *)(long)ctx_value);
+
+        STACK_WIND (frame, ioc_truncate_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->ftruncate,
+                    fd, offset);
+
+        return 0;
+}
+
+/*
+ * ioc_lk_cbk - lk callback; passes the child's reply upwards unchanged
+ */
+int32_t
+ioc_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+            int32_t op_ret, int32_t op_errno, struct flock *lock)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, lock);
+
+        return 0;
+}
+
+/*
+ * ioc_lk - lk fop; refreshes the inode's validation timestamp and winds
+ *
+ * Unwinds with -1/EBADFD when the inode carries no io-cache context.
+ */
+int32_t
+ioc_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+        struct flock *lock)
+{
+        uint64_t     ctx_value = 0;
+        ioc_inode_t *ioc_inode = NULL;
+
+        inode_ctx_get (fd->inode, this, &ctx_value);
+        ioc_inode = (ioc_inode_t *)(long)ctx_value;
+
+        if (ioc_inode == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "inode context is NULL: returning EBADFD");
+                STACK_UNWIND (frame, -1, EBADFD, NULL);
+                return 0;
+        }
+
+        ioc_inode_lock (ioc_inode);
+        {
+                gettimeofday (&ioc_inode->tv, NULL);
+        }
+        ioc_inode_unlock (ioc_inode);
+
+        STACK_WIND (frame,
+                    ioc_lk_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lk,
+                    fd,
+                    cmd,
+                    lock);
+
+        return 0;
+}
+
+/*
+ * ioc_get_priority_list - parse the "priority" option into a rule list
+ *
+ * @opt_str: comma separated "pattern:priority" pairs,
+ * e.g. "*.jpg:1,abc*:2"
+ * @first: list head to which one struct ioc_priority per pair is appended
+ *
+ * Returns the highest priority seen (0 for an empty string), or -1 when a
+ * pair lacks a pattern or a priority, or the priority is not a number.
+ * Entries appended before a failure stay on @first; the temporary copies
+ * made while parsing are released on every path.
+ */
+int32_t
+ioc_get_priority_list (const char *opt_str, struct list_head *first)
+{
+        int32_t              max_pri = 0;
+        char                *list_save = NULL;
+        char                *pair_save = NULL;
+        char                *num_end = NULL;
+        char                *dup_str = NULL;
+        char                *stripe_str = NULL;
+        char                *pattern = NULL;
+        char                *priority = NULL;
+        char                *string = strdup (opt_str);
+        struct ioc_priority *curr = NULL;
+
+        ERR_ABORT (string);
+
+        /* Get the pattern for cache priority.
+         * "option priority *.jpg:1,abc*:2" etc
+         */
+        stripe_str = strtok_r (string, ",", &list_save);
+        while (stripe_str) {
+                curr = CALLOC (1, sizeof (struct ioc_priority));
+                ERR_ABORT (curr);
+                list_add_tail (&curr->list, first);
+
+                /* split a private copy so the outer tokenizer's buffer
+                 * is left untouched */
+                dup_str = strdup (stripe_str);
+                ERR_ABORT (dup_str);
+
+                pattern = strtok_r (dup_str, ":", &pair_save);
+                if (!pattern) {
+                        max_pri = -1;
+                        goto out;
+                }
+                priority = strtok_r (NULL, ":", &pair_save);
+                if (!priority) {
+                        max_pri = -1;
+                        goto out;
+                }
+                gf_log ("io-cache",
+                        GF_LOG_DEBUG,
+                        "ioc priority : pattern %s : priority %s",
+                        pattern,
+                        priority);
+                curr->pattern = strdup (pattern);
+                ERR_ABORT (curr->pattern);
+                curr->priority = strtol (priority, &num_end, 0);
+                if (num_end && (*num_end)) {
+                        /* trailing characters after the number */
+                        max_pri = -1;
+                        goto out;
+                }
+                max_pri = max (max_pri, curr->priority);
+
+                free (dup_str);
+                dup_str = NULL;
+
+                stripe_str = strtok_r (NULL, ",", &list_save);
+        }
+
+out:
+        free (dup_str);
+        free (string);
+
+        return max_pri;
+}
+
+/*
+ * init - set up the io-cache table from the volume options
+ *
+ * @this: io-cache xlator; must have exactly one child
+ *
+ * Reads "page-size", "cache-size", "cache-timeout" and "priority",
+ * allocates one LRU list head per priority level and stores the table in
+ * this->private. Returns 0 on success and -1 on a configuration error, in
+ * which case the table and any priority rules parsed so far are released.
+ */
+int32_t
+init (xlator_t *this)
+{
+        ioc_table_t         *table = NULL;
+        dict_t              *options = this->options;
+        uint32_t             index = 0;
+        char                *page_size_string = NULL;
+        char                *cache_size_string = NULL;
+        struct ioc_priority *curr = NULL;
+        struct ioc_priority *next = NULL;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "FATAL: io-cache not configured with exactly "
+                        "one child");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        table = (void *) CALLOC (1, sizeof (*table));
+        ERR_ABORT (table);
+
+        table->xl = this;
+        table->page_size = IOC_PAGE_SIZE;
+        table->cache_size = IOC_CACHE_SIZE;
+
+        /* initialised up front so the error path can always walk it */
+        INIT_LIST_HEAD (&table->priority_list);
+
+        if (dict_get (options, "page-size"))
+                page_size_string = data_to_str (dict_get (options,
+                                                          "page-size"));
+
+        if (page_size_string) {
+                if (gf_string2bytesize (page_size_string,
+                                        &table->page_size) != 0) {
+                        gf_log ("io-cache", GF_LOG_ERROR,
+                                "invalid number format \"%s\" of "
+                                "\"option page-size\"",
+                                page_size_string);
+                        goto err;
+                }
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "using page-size %"PRIu64"", table->page_size);
+        }
+
+        if (dict_get (options, "cache-size"))
+                cache_size_string = data_to_str (dict_get (options,
+                                                           "cache-size"));
+        if (cache_size_string) {
+                if (gf_string2bytesize (cache_size_string,
+                                        &table->cache_size) != 0) {
+                        gf_log ("io-cache", GF_LOG_ERROR,
+                                "invalid number format \"%s\" of "
+                                "\"option cache-size\"",
+                                cache_size_string);
+                        goto err;
+                }
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "using cache-size %"PRIu64"", table->cache_size);
+        }
+
+        /* default revalidation interval, in seconds */
+        table->cache_timeout = 1;
+
+        if (dict_get (options, "cache-timeout")) {
+                table->cache_timeout =
+                        data_to_uint32 (dict_get (options,
+                                                  "cache-timeout"));
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Using %d seconds to revalidate cache",
+                        table->cache_timeout);
+        }
+
+        if (dict_get (options, "priority")) {
+                char *option_list = data_to_str (dict_get (options,
+                                                           "priority"));
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "option path %s", option_list);
+                /* parse the list of pattern:priority */
+                table->max_pri = ioc_get_priority_list (option_list,
+                                                        &table->priority_list);
+
+                if (table->max_pri == -1)
+                        goto err;
+        }
+        /* one LRU list per priority level 0..max_pri */
+        table->max_pri ++;
+        INIT_LIST_HEAD (&table->inodes);
+
+        table->inode_lru = CALLOC (table->max_pri, sizeof (struct list_head));
+        ERR_ABORT (table->inode_lru);
+        for (index = 0; index < (table->max_pri); index++)
+                INIT_LIST_HEAD (&table->inode_lru[index]);
+
+        pthread_mutex_init (&table->table_lock, NULL);
+        this->private = table;
+        return 0;
+
+err:
+        /* release whatever priority rules were parsed before the failure */
+        list_for_each_entry_safe (curr, next, &table->priority_list, list) {
+                FREE (curr->pattern);
+                FREE (curr);
+        }
+        FREE (table);
+
+        return -1;
+}
+
+/*
+ * fini - release the io-cache table built by init()
+ *
+ * @this: io-cache xlator
+ *
+ * Does nothing when this->private is NULL. Otherwise destroys the table
+ * lock and frees the priority rules, the LRU list-head array and the table
+ * itself.
+ */
+void
+fini (xlator_t *this)
+{
+        ioc_table_t         *table = this->private;
+        struct ioc_priority *curr = NULL;
+        struct ioc_priority *next = NULL;
+
+        if (!table)
+                return;
+
+        pthread_mutex_destroy (&table->table_lock);
+
+        list_for_each_entry_safe (curr, next, &table->priority_list, list) {
+                FREE (curr->pattern);
+                FREE (curr);
+        }
+
+        FREE (table->inode_lru);
+        FREE (table);
+
+        this->private = NULL;
+        return;
+}
+
+/* File operations this translator intercepts; each handler is defined
+ * earlier in this file. */
+struct xlator_fops fops = {
+ .open = ioc_open,
+ .create = ioc_create,
+ .readv = ioc_readv,
+ .writev = ioc_writev,
+ .truncate = ioc_truncate,
+ .ftruncate = ioc_ftruncate,
+ .utimens = ioc_utimens,
+ .lookup = ioc_lookup,
+ .lk = ioc_lk
+};
+
+/* No management operations are set by io-cache. */
+struct xlator_mops mops = {
+};
+
+/* Callbacks: ioc_forget destroys the inode's cache context, ioc_release
+ * does nothing. */
+struct xlator_cbks cbks = {
+ .forget = ioc_forget,
+ .release = ioc_release
+};
+
+/* Volume options accepted by io-cache; init() reads "priority",
+ * "cache-timeout", "page-size" and "cache-size" from this->options. */
+struct volume_options options[] = {
+ /* comma separated "pattern:priority" pairs, parsed by
+ * ioc_get_priority_list() */
+ { .key = {"priority"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+ /* seconds before cached data is revalidated; init() defaults to 1.
+ * NOTE(review): init() only looks up "cache-timeout" - presumably the
+ * second key is mapped to it by the option framework; confirm */
+ { .key = {"cache-timeout", "force-revalidate-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 60
+ },
+ /* page size; init() defaults to IOC_PAGE_SIZE */
+ { .key = {"page-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 16 * GF_UNIT_KB,
+ .max = 4 * GF_UNIT_MB
+ },
+ /* total cache size; init() defaults to IOC_CACHE_SIZE */
+ { .key = {"cache-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 4 * GF_UNIT_MB,
+ .max = 6 * GF_UNIT_GB
+ },
+ /* terminator */
+ { .key = {NULL} },
+};