summaryrefslogtreecommitdiffstats
path: root/xlators/performance/io-cache/src/page.c
diff options
context:
space:
mode:
authorVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
committerVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
commit77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree02e155a5753b398ee572b45793f889b538efab6b /xlators/performance/io-cache/src/page.c
parentf3b2e6580e5663292ee113c741343c8a43ee133f (diff)
Added all files
Diffstat (limited to 'xlators/performance/io-cache/src/page.c')
-rw-r--r--xlators/performance/io-cache/src/page.c778
1 file changed, 778 insertions, 0 deletions
diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c
new file mode 100644
index 000000000..e549f0bb5
--- /dev/null
+++ b/xlators/performance/io-cache/src/page.c
@@ -0,0 +1,778 @@
+/*
+ Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "io-cache.h"
+#include <assert.h>
+#include <sys/time.h>
+
+/*
+ * ioc_page_get - look up the cached page covering @offset and, when
+ * found, bump it to the most-recently-used end of the inode's lru list.
+ *
+ * @ioc_inode: inode whose page list is searched
+ * @offset:    file offset; rounded down to the table's page boundary
+ *
+ * returns the page, or NULL when no page caches the rounded offset.
+ */
+ioc_page_t *
+ioc_page_get (ioc_inode_t *ioc_inode,
+	      off_t offset)
+{
+        ioc_table_t *table = ioc_inode->table;
+        off_t        page_begin = floor (offset, table->page_size);
+        ioc_page_t  *trav = NULL;
+
+        if (list_empty (&ioc_inode->pages))
+                return NULL;
+
+        list_for_each_entry (trav, &ioc_inode->pages, pages) {
+                if (trav->offset == page_begin) {
+                        /* cache hit: mark the page most recently used */
+                        list_move_tail (&trav->page_lru,
+                                        &ioc_inode->page_lru);
+                        return trav;
+                }
+        }
+
+        return NULL;
+}
+
+
+/*
+ * ioc_page_destroy - unlink and free a page, unless frames are still
+ * waiting on it.
+ *
+ * @page: page to destroy
+ *
+ * returns the number of cache bytes released, or -1 when the page was
+ * left intact because its waitq is non-empty.
+ */
+int64_t
+ioc_page_destroy (ioc_page_t *page)
+{
+        int64_t freed_bytes = page->size;
+
+        if (page->waitq) {
+                /* frames are parked on this page; do not destroy it */
+                return -1;
+        }
+
+        list_del (&page->pages);
+        list_del (&page->page_lru);
+
+        gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
+                "destroying page = %p, offset = %"PRId64" "
+                "&& inode = %p",
+                page, page->offset, page->inode);
+
+        if (page->vector) {
+                /* drop our hold on the reply dict backing the iovec */
+                dict_unref (page->ref);
+                free (page->vector);
+                page->vector = NULL;
+        }
+
+        page->inode = NULL;
+
+        pthread_mutex_destroy (&page->page_lock);
+        free (page);
+
+        return freed_bytes;
+}
+
+/*
+ * ioc_prune - prune the cache. we have a limit to the number of pages we
+ * can have in-memory.
+ *
+ * @table: ioc_table_t of this translator
+ *
+ * walks the per-priority inode lru lists, destroying least recently
+ * used pages until at least (cache_used - cache_size) bytes have been
+ * visited. always returns 0. runs with the table lock held and takes
+ * each inode's lock while pruning its pages.
+ */
+int32_t
+ioc_prune (ioc_table_t *table)
+{
+        ioc_inode_t *curr = NULL, *next_ioc_inode = NULL;
+        ioc_page_t *page = NULL, *next = NULL;
+        int32_t ret = -1;
+        int32_t index = 0;
+        uint64_t size_to_prune = 0;
+        uint64_t size_pruned = 0;
+
+        ioc_table_lock (table);
+        {
+                size_to_prune = table->cache_used - table->cache_size;
+                /* take out the least recently used inode */
+                for (index=0; index < table->max_pri; index++) {
+                        list_for_each_entry_safe (curr, next_ioc_inode,
+                                                  &table->inode_lru[index],
+                                                  inode_lru) {
+                                /* prune page-by-page for this inode, till
+                                 * we reach the equilibrium */
+                                ioc_inode_lock (curr);
+                                /* { */
+
+                                list_for_each_entry_safe (page, next,
+                                                          &curr->page_lru,
+                                                          page_lru) {
+                                        /* done with all pages, and not
+                                         * reached equilibrium yet??
+                                         * continue with next inode in
+                                         * lru_list */
+                                        /* NOTE(review): size_pruned counts
+                                         * page->size even when
+                                         * ioc_page_destroy returns -1
+                                         * (page kept alive by waiters),
+                                         * so it can overstate the bytes
+                                         * actually released */
+                                        size_pruned += page->size;
+                                        ret = ioc_page_destroy (page);
+
+                                        if (ret != -1)
+                                                table->cache_used -= ret;
+
+                                        gf_log (table->xl->name,
+                                                GF_LOG_DEBUG,
+                                                "index = %d && table->cache_"
+                                                "used = %"PRIu64" && table->"
+                                                "cache_size = %"PRIu64,
+                                                index, table->cache_used,
+                                                table->cache_size);
+
+                                        if (size_pruned >= size_to_prune)
+                                                break;
+                                } /* list_for_each_entry_safe(page...) */
+                                /* an inode with no cached pages no longer
+                                 * belongs on the lru list */
+                                if (list_empty (&curr->pages)) {
+                                        list_del_init (&curr->inode_lru);
+                                }
+
+                                /* } */
+                                ioc_inode_unlock (curr);
+
+                                if (size_pruned >= size_to_prune)
+                                        break;
+                        } /* list_for_each_entry_safe (curr...) */
+
+                        if (size_pruned >= size_to_prune)
+                                break;
+                } /* for(index=0;...) */
+
+        } /* ioc_inode_table locked region end */
+        ioc_table_unlock (table);
+
+        return 0;
+}
+
+/*
+ * ioc_page_create - create a new page for @offset and link it into the
+ * inode's page and lru lists.
+ *
+ * @ioc_inode: inode the page will cache data for; may be NULL
+ * @offset:    file offset; rounded down to the table's page boundary
+ *
+ * returns the newly allocated page, or NULL when @ioc_inode is NULL.
+ *
+ * fix: the previous version dereferenced ioc_inode->table (and used it
+ * in floor()) before the NULL check, and leaked the freshly CALLOC'd
+ * page on the early return. validate first, then allocate.
+ */
+ioc_page_t *
+ioc_page_create (ioc_inode_t *ioc_inode,
+		 off_t offset)
+{
+        ioc_table_t *table = NULL;
+        ioc_page_t *newpage = NULL;
+        off_t rounded_offset = 0;
+
+        if (ioc_inode == NULL) {
+                return NULL;
+        }
+
+        table = ioc_inode->table;
+        rounded_offset = floor (offset, table->page_size);
+
+        newpage = CALLOC (1, sizeof (*newpage));
+        ERR_ABORT (newpage);
+
+        newpage->offset = rounded_offset;
+        newpage->inode = ioc_inode;
+        pthread_mutex_init (&newpage->page_lock, NULL);
+
+        /* newest page starts at the most-recently-used end of the lru */
+        list_add_tail (&newpage->page_lru, &ioc_inode->page_lru);
+        list_add_tail (&newpage->pages, &ioc_inode->pages);
+
+        gf_log ("io-cache", GF_LOG_DEBUG,
+                "returning new page %p", newpage);
+        return newpage;
+}
+
+/*
+ * ioc_wait_on_page - park a frame on a page until the page's data
+ * arrives; the frame that triggered the page fault may itself wait here.
+ *
+ * @page:   page to wait on
+ * @frame:  call frame that is waiting on the page
+ * @offset: file offset the frame wants served from this page
+ * @size:   number of bytes the frame asked for
+ */
+void
+ioc_wait_on_page (ioc_page_t *page,
+		  call_frame_t *frame,
+		  off_t offset,
+		  size_t size)
+{
+        ioc_local_t *local = frame->local;
+        ioc_waitq_t *entry = CALLOC (1, sizeof (*entry));
+
+        ERR_ABORT (entry);
+
+        gf_log (frame->this->name, GF_LOG_DEBUG,
+                "frame(%p) waiting on page = %p, offset=%"PRId64", "
+                "size=%"GF_PRI_SIZET"",
+                frame, page, offset, size);
+
+        /* push onto the head of the page's singly linked wait queue */
+        entry->data = frame;
+        entry->pending_offset = offset;
+        entry->pending_size = size;
+        entry->next = page->waitq;
+        page->waitq = entry;
+
+        /* one frame can wait only once on a given page,
+         * local->wait_count is number of pages a frame is waiting on */
+        ioc_local_lock (local);
+        {
+                local->wait_count++;
+        }
+        ioc_local_unlock (local);
+}
+
+
+/*
+ * ioc_cache_still_valid - check whether the pages cached for @ioc_inode
+ * may still be served, by comparing the stored mtime with @stbuf.
+ *
+ * @ioc_inode: inode whose cache is being validated (caller holds lock)
+ * @stbuf:     freshly fetched attributes; NULL forces invalidation
+ *
+ * returns 1 when the cache is usable, 0 otherwise. a stricter
+ * nanosecond-granularity comparison was present here historically but
+ * is deliberately disabled.
+ */
+int8_t
+ioc_cache_still_valid (ioc_inode_t *ioc_inode,
+		       struct stat *stbuf)
+{
+        if (stbuf == NULL)
+                return 0;
+
+        if (stbuf->st_mtime != ioc_inode->mtime)
+                return 0;
+
+        return 1;
+}
+
+
+/*
+ * ioc_waitq_return - resume every frame queued on @waitq and release
+ * the queue nodes.
+ *
+ * @waitq: head of a singly linked wait queue (may be NULL)
+ */
+void
+ioc_waitq_return (ioc_waitq_t *waitq)
+{
+        ioc_waitq_t *node = waitq;
+
+        while (node != NULL) {
+                /* grab the link before freeing the node */
+                ioc_waitq_t *tmp = node->next;
+
+                ioc_frame_return (node->data);
+                free (node);
+
+                node = tmp;
+        }
+}
+
+
+/*
+ * ioc_fault_cbk - completion callback for the readv wound by
+ * ioc_page_fault(). invalidates the inode's cache if the file changed,
+ * stores the returned iovec in the faulted page, wakes every frame
+ * waiting on that page, updates cache accounting, and destroys the
+ * fault frame.
+ *
+ * @frame:  fault frame created in ioc_page_fault(); destroyed here
+ * @op_ret: number of bytes read, or -1 on error (op_errno set)
+ * @vector: data returned by the child translator
+ * @count:  number of iovec entries in @vector
+ * @stbuf:  fresh attributes used to (in)validate the existing cache
+ */
+int
+ioc_fault_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               struct iovec *vector,
+               int32_t count,
+               struct stat *stbuf)
+{
+        ioc_local_t *local = frame->local;
+        off_t offset = local->pending_offset;
+        ioc_inode_t *ioc_inode = local->inode;
+        ioc_table_t *table = ioc_inode->table;
+        ioc_page_t *page = NULL;
+        off_t trav_offset = 0;
+        size_t payload_size = 0;
+        int32_t destroy_size = 0;
+        size_t page_size = 0;
+        ioc_waitq_t *waitq = NULL;
+
+        /* NOTE(review): trav_offset and payload_size are assigned here
+         * but never read again in this function */
+        trav_offset = offset;
+        payload_size = op_ret;
+
+        ioc_inode_lock (ioc_inode);
+        {
+                /* an error reply, or a changed mtime, invalidates every
+                 * cached page of this inode */
+                if (op_ret == -1 ||
+                    (op_ret >= 0 &&
+                     !ioc_cache_still_valid(ioc_inode, stbuf))) {
+                        gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
+                                "cache for inode(%p) is invalid. flushing "
+                                "all pages", ioc_inode);
+                        destroy_size = __ioc_inode_flush (ioc_inode);
+                }
+
+                if (op_ret >= 0)
+                        ioc_inode->mtime = stbuf->st_mtime;
+
+                /* remember when this inode was last (re)validated */
+                gettimeofday (&ioc_inode->tv, NULL);
+
+                if (op_ret < 0) {
+                        /* error, readv returned -1 */
+                        page = ioc_page_get (ioc_inode, offset);
+                        if (page)
+                                waitq = ioc_page_error (page, op_ret,
+                                                        op_errno);
+                } else {
+                        gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
+                                "op_ret = %d", op_ret);
+                        page = ioc_page_get (ioc_inode, offset);
+                        if (!page) {
+                                /* page was flushed */
+                                /* some serious bug ? */
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "wasted copy: %"PRId64"[+%"PRId64"] "
+                                        "ioc_inode=%p", offset,
+                                        table->page_size, ioc_inode);
+                        } else {
+                                /* replace any stale data the page held */
+                                if (page->vector) {
+                                        dict_unref (page->ref);
+                                        free (page->vector);
+                                        page->vector = NULL;
+                                }
+
+                                /* keep a copy of the page for our cache */
+                                page->vector = iov_dup (vector, count);
+                                page->count = count;
+                                if (frame->root->rsp_refs) {
+                                        dict_ref (frame->root->rsp_refs);
+                                        page->ref = frame->root->rsp_refs;
+                                } else {
+                                        /* TODO: we have got a response to
+                                         * our request and no data */
+                                        gf_log (this->name, GF_LOG_CRITICAL,
+                                                "frame>root>rsp_refs is null");
+                                } /* if(frame->root->rsp_refs) */
+
+                                /* page->size should indicate exactly how
+                                 * much the readv call to the child
+                                 * translator returned. earlier op_ret
+                                 * from child translator was used, which
+                                 * gave rise to a bug where reads from
+                                 * io-cached volume were resulting in 0
+                                 * byte replies */
+                                page_size = iov_length(vector, count);
+
+                                page->size = page_size;
+
+                                if (page->waitq) {
+                                        /* wake up all the frames waiting on
+                                         * this page, including
+                                         * the frame which triggered fault */
+                                        waitq = ioc_page_wakeup (page);
+                                } /* if(page->waitq) */
+                        } /* if(!page)...else */
+                } /* if(op_ret < 0)...else */
+        } /* ioc_inode locked region end */
+        ioc_inode_unlock (ioc_inode);
+
+        /* resume waiters outside the inode lock */
+        ioc_waitq_return (waitq);
+
+        /* account the bytes just cached */
+        if (page_size) {
+                ioc_table_lock (table);
+                {
+                        table->cache_used += page_size;
+                }
+                ioc_table_unlock (table);
+        }
+
+        /* subtract whatever the flush above released */
+        if (destroy_size) {
+                ioc_table_lock (table);
+                {
+                        table->cache_used -= destroy_size;
+                }
+                ioc_table_unlock (table);
+        }
+
+        if (ioc_need_prune (ioc_inode->table)) {
+                ioc_prune (ioc_inode->table);
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "fault frame %p returned", frame);
+        pthread_mutex_destroy (&local->local_lock);
+
+        /* release the ref taken in ioc_page_fault() */
+        fd_unref (local->fd);
+
+        STACK_DESTROY (frame->root);
+        return 0;
+}
+
+/*
+ * ioc_page_fault - issue a readv to the child translator to populate a
+ * page; completion lands in ioc_fault_cbk().
+ *
+ * @ioc_inode: inode owning the page being faulted in
+ * @frame:     frame of the fop that triggered the fault (copied here)
+ * @fd:        fd to read from; ref'd now, unref'd in ioc_fault_cbk
+ * @offset:    offset to read table->page_size bytes from
+ */
+void
+ioc_page_fault (ioc_inode_t *ioc_inode,
+		call_frame_t *frame,
+		fd_t *fd,
+		off_t offset)
+{
+        ioc_table_t *table = ioc_inode->table;
+        call_frame_t *fault_frame = copy_frame (frame);
+        ioc_local_t *local = CALLOC (1, sizeof (ioc_local_t));
+
+        ERR_ABORT (local);
+
+        /* NOTE: copy_frame() means, the frame the fop whose fd_ref we
+         * are using till now won't be valid till we get reply from server.
+         * we unref this fd, in fault_cbk */
+        local->fd = fd_ref (fd);
+
+        fault_frame->local = local;
+        pthread_mutex_init (&local->local_lock, NULL);
+
+        INIT_LIST_HEAD (&local->fill_list);
+        local->pending_offset = offset;
+        local->pending_size = table->page_size;
+        local->inode = ioc_inode;
+
+        gf_log (frame->this->name, GF_LOG_DEBUG,
+                "stack winding page fault for offset = %"PRId64" with "
+                "frame %p", offset, fault_frame);
+
+        STACK_WIND (fault_frame, ioc_fault_cbk,
+                    FIRST_CHILD(fault_frame->this),
+                    FIRST_CHILD(fault_frame->this)->fops->readv,
+                    fd, table->page_size, offset);
+}
+
+/*
+ * ioc_frame_fill - copy the portion of @page that satisfies @frame's
+ * pending read into a fresh ioc_fill_t and insert it, ordered by file
+ * offset, into the frame's fill_list; the list is later flattened by
+ * ioc_frame_unwind().
+ *
+ * @page:   ready page to copy from
+ * @frame:  frame being served (frame->local carries the fill_list)
+ * @offset: file offset the frame is waiting for
+ * @size:   number of bytes the frame asked for
+ *
+ * NOTE(review): the visible caller (ioc_page_wakeup, via ioc_fault_cbk)
+ * runs this under the inode lock; confirm the same holds for any
+ * cache-hit path outside this file.
+ */
+void
+ioc_frame_fill (ioc_page_t *page,
+		call_frame_t *frame,
+		off_t offset,
+		size_t size)
+{
+        ioc_local_t *local = frame->local;
+        ioc_fill_t *fill = NULL;
+        off_t src_offset = 0;
+        off_t dst_offset = 0;
+        ssize_t copy_size = 0;
+        ioc_inode_t *ioc_inode = page->inode;
+
+        gf_log (frame->this->name, GF_LOG_DEBUG,
+                "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" "
+                "&& page->size = %"GF_PRI_SIZET" && wait_count = %d",
+                frame, offset, size, page->size, local->wait_count);
+
+        /* immediately move this page to the end of the page_lru list */
+        list_move_tail (&page->page_lru, &ioc_inode->page_lru);
+        /* fill local->pending_size bytes from local->pending_offset;
+         * skip entirely if the frame already failed or the page is empty */
+        if (local->op_ret != -1 && page->size) {
+                if (offset > page->offset)
+                        /* offset is offset in file, convert it to offset in
+                         * page */
+                        src_offset = offset - page->offset;
+                /*FIXME: since offset is the offset within page is the
+                 * else case valid? */
+                else
+                        /* local->pending_offset is in previous page. do not
+                         * fill until we have filled all previous pages */
+                        dst_offset = page->offset - offset;
+
+                /* we have to copy from offset to either end of this page
+                 * or till the requested size */
+                copy_size = min (page->size - src_offset,
+                                 size - dst_offset);
+
+                if (copy_size < 0) {
+                        /* if page contains fewer bytes and the required offset
+                           is beyond the page size in the page */
+                        copy_size = src_offset = 0;
+                }
+
+                gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
+                        "copy_size = %"GF_PRI_SIZET" && src_offset = "
+                        "%"PRId64" && dst_offset = %"PRId64"",
+                        copy_size, src_offset, dst_offset);
+
+                {
+                        ioc_fill_t *new = CALLOC (1, sizeof (*new));
+                        ERR_ABORT (new);
+                        new->offset = page->offset;
+                        new->size = copy_size;
+                        /* hold the page's reply dict alive for the fill */
+                        new->refs = dict_ref (page->ref);
+                        /* first pass with a NULL vector just counts the
+                         * iovec entries needed for the subset */
+                        new->count = iov_subset (page->vector,
+                                                 page->count,
+                                                 src_offset,
+                                                 src_offset + copy_size,
+                                                 NULL);
+                        new->vector = CALLOC (new->count,
+                                              sizeof (struct iovec));
+                        ERR_ABORT (new->vector);
+                        new->count = iov_subset (page->vector,
+                                                 page->count,
+                                                 src_offset,
+                                                 src_offset + copy_size,
+                                                 new->vector);
+
+
+
+                        /* add the ioc_fill to fill_list for this frame */
+                        if (list_empty (&local->fill_list)) {
+                                /* if list is empty, then this is the first
+                                 * time we are filling frame, add the
+                                 * ioc_fill_t to the end of list */
+                                list_add_tail (&new->list, &local->fill_list);
+                        } else {
+                                int8_t found = 0;
+                                /* list is not empty, we need to look for
+                                 * where this offset fits in list */
+                                list_for_each_entry (fill, &local->fill_list,
+                                                     list) {
+                                        if (fill->offset > new->offset) {
+                                                found = 1;
+                                                break;
+                                        }
+                                }
+
+                                if (found) {
+                                        found = 0;
+                                        /* list_add_tail before 'fill'
+                                         * inserts 'new' just ahead of the
+                                         * first larger offset, keeping the
+                                         * list sorted */
+                                        list_add_tail (&new->list,
+                                                       &fill->list);
+                                } else {
+                                        list_add_tail (&new->list,
+                                                       &local->fill_list);
+                                }
+                        }
+                }
+                /* op_ret accumulates the total bytes gathered so far */
+                local->op_ret += copy_size;
+        }
+}
+
+/*
+ * ioc_frame_unwind - frame unwinds only from here
+ *
+ * @frame: call frame to unwind
+ *
+ * to be used only by ioc_frame_return(), when a frame has
+ * finished waiting on all pages, required
+ *
+ * flattens the ioc_fill_t entries accumulated on local->fill_list into
+ * a single iovec array, merges their ref dicts into one, and
+ * STACK_UNWINDs the read reply. frees local, the merged dict and the
+ * temporary vector afterwards.
+ */
+static void
+ioc_frame_unwind (call_frame_t *frame)
+{
+        ioc_local_t *local = frame->local;
+        ioc_fill_t *fill = NULL, *next = NULL;
+        int32_t count = 0;
+        struct iovec *vector = NULL;
+        int32_t copied = 0;
+        dict_t *refs = NULL;
+        struct stat stbuf = {0,};
+        int32_t op_ret = 0;
+
+        // ioc_local_lock (local);
+        refs = get_new_dict ();
+
+        frame->local = NULL;
+
+        if (list_empty (&local->fill_list)) {
+                gf_log (frame->this->name, GF_LOG_DEBUG,
+                        "frame(%p) has 0 entries in local->fill_list "
+                        "(offset = %"PRId64" && size = %"GF_PRI_SIZET")",
+                        frame, local->offset, local->size);
+        }
+
+        /* first pass: total iovec entries across all fills */
+        list_for_each_entry (fill, &local->fill_list, list) {
+                count += fill->count;
+        }
+
+        vector = CALLOC (count, sizeof (*vector));
+        ERR_ABORT (vector);
+
+        /* second pass: concatenate the per-fill iovec arrays; 'copied'
+         * advances in bytes, matching the (char *) cast */
+        list_for_each_entry_safe (fill, next, &local->fill_list, list) {
+                memcpy (((char *)vector) + copied,
+                        fill->vector,
+                        fill->count * sizeof (*vector));
+
+                copied += (fill->count * sizeof (*vector));
+
+                /* merge the fill's refs into the reply dict so the data
+                 * stays alive across the unwind */
+                dict_copy (fill->refs, refs);
+
+                list_del (&fill->list);
+                dict_unref (fill->refs);
+                free (fill->vector);
+                free (fill);
+        }
+
+        frame->root->rsp_refs = dict_ref (refs);
+
+        /* NOTE(review): op_ret is recomputed from the gathered vector,
+         * replacing whatever local->op_ret held; local->op_errno is
+         * still passed through below */
+        op_ret = iov_length (vector, count);
+        gf_log (frame->this->name, GF_LOG_DEBUG,
+                "frame(%p) unwinding with op_ret=%d", frame, op_ret);
+
+        // ioc_local_unlock (local);
+
+        /* reply carries a zeroed stbuf */
+        STACK_UNWIND (frame,
+                      op_ret,
+                      local->op_errno,
+                      vector,
+                      count,
+                      &stbuf);
+
+        dict_unref (refs);
+
+        pthread_mutex_destroy (&local->local_lock);
+        free (local);
+        free (vector);
+
+        return;
+}
+
+/*
+ * ioc_frame_return - note that one of the pages this frame was waiting
+ * on has arrived; unwind the frame once its wait count reaches zero.
+ *
+ * @frame: frame previously registered via ioc_wait_on_page()
+ */
+void
+ioc_frame_return (call_frame_t *frame)
+{
+        ioc_local_t *local = frame->local;
+        int32_t pending = 0;
+
+        assert (local->wait_count > 0);
+
+        ioc_local_lock (local);
+        {
+                pending = --local->wait_count;
+        }
+        ioc_local_unlock (local);
+
+        if (pending == 0)
+                ioc_frame_unwind (frame);
+}
+
+/*
+ * ioc_page_wakeup - detach the page's wait queue, mark the page ready,
+ * and fill each waiting frame from the now-complete page.
+ *
+ * @page: page whose data has just arrived
+ *
+ * returns the detached wait queue so the caller can resume the frames
+ * outside the inode lock via ioc_waitq_return().
+ */
+ioc_waitq_t *
+ioc_page_wakeup (ioc_page_t *page)
+{
+        ioc_waitq_t *waitq = page->waitq;
+        ioc_waitq_t *trav = NULL;
+
+        page->waitq = NULL;
+        page->ready = 1;
+
+        gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
+                "page is %p && waitq = %p", page, waitq);
+
+        for (trav = waitq; trav != NULL; trav = trav->next) {
+                ioc_frame_fill (page, trav->data, trav->pending_offset,
+                                trav->pending_size);
+        }
+
+        return waitq;
+}
+
+
+/*
+ * ioc_page_error - propagate a read error to every frame waiting on
+ * @page, then destroy the page and adjust the cache accounting.
+ *
+ * @page:     page whose fault failed
+ * @op_ret:   error return value handed to the waiters
+ * @op_errno: errno handed to the waiters
+ *
+ * returns the detached wait queue; the caller resumes the frames with
+ * ioc_waitq_return() after dropping the inode lock.
+ */
+ioc_waitq_t *
+ioc_page_error (ioc_page_t *page,
+		int32_t op_ret,
+		int32_t op_errno)
+{
+        ioc_waitq_t *waitq = page->waitq;
+        ioc_waitq_t *trav = NULL;
+        ioc_table_t *table = NULL;
+        int64_t destroyed = 0;
+
+        page->waitq = NULL;
+
+        gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
+                "page error for page = %p & waitq = %p", page, waitq);
+
+        for (trav = waitq; trav != NULL; trav = trav->next) {
+                call_frame_t *frame = trav->data;
+                ioc_local_t *local = frame->local;
+
+                ioc_local_lock (local);
+                {
+                        /* a frame already marked failed keeps its
+                         * original error */
+                        if (local->op_ret != -1) {
+                                local->op_ret = op_ret;
+                                local->op_errno = op_errno;
+                        }
+                }
+                ioc_local_unlock (local);
+        }
+
+        table = page->inode->table;
+        destroyed = ioc_page_destroy (page);
+
+        if (destroyed != -1) {
+                table->cache_used -= destroyed;
+        }
+
+        return waitq;
+}