diff options
Diffstat (limited to 'xlators/performance/read-ahead/src')
-rw-r--r-- | xlators/performance/read-ahead/src/Makefile.am | 14 | ||||
-rw-r--r-- | xlators/performance/read-ahead/src/page.c | 487 | ||||
-rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.c | 890 | ||||
-rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.h | 194 |
4 files changed, 1585 insertions, 0 deletions
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am new file mode 100644 index 00000000000..7bb90228227 --- /dev/null +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = read-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +read_ahead_la_LDFLAGS = -module -avoidversion + +read_ahead_la_SOURCES = read-ahead.c page.c +read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = read-ahead.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c new file mode 100644 index 00000000000..3b8d4d2093e --- /dev/null +++ b/xlators/performance/read-ahead/src/page.c @@ -0,0 +1,487 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> + + +ra_page_t * +ra_page_get (ra_file_t *file, + off_t offset) +{ + ra_page_t *page = NULL; + off_t rounded_offset = 0; + + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); + + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; + + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; + + return page; +} + + +ra_page_t * +ra_page_create (ra_file_t *file, off_t offset) +{ + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; + + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); + + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; + + if (page == &file->pages || page->offset != rounded_offset) { + newpage = CALLOC (1, sizeof (*newpage)); + if (!newpage) + return NULL; + + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; + + page = newpage; + } + + return page; +} + + +void +ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +{ + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; + + + local = frame->local; + waitq = CALLOC (1, sizeof (*waitq)); + if (!waitq) { + gf_log (frame->this->name, GF_LOG_ERROR, + "out of memory :("); + return; + } + + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; + + ra_local_lock (local); + { + local->wait_count++; + } + ra_local_unlock (local); +} + + +void +ra_waitq_return (ra_waitq_t *waitq) +{ + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + call_frame_t *frame = NULL; + + for (trav = waitq; trav; trav = next) { + next = trav->next; + + frame = trav->data; + ra_frame_return (frame); + free (trav); + } +} + + +int +ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct stat *stbuf) +{ + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + off_t trav_offset = 0; + size_t payload_size = 0; + ra_waitq_t *waitq = NULL; + fd_t *fd = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + local = frame->local; + fd = local->fd; + + ret = fd_ctx_get (fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + trav_offset = pending_offset; + payload_size = op_ret; + + ra_file_lock (file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + if (op_ret < 0) { + page = ra_page_get (file, pending_offset); + if (page) + waitq = ra_page_error (page, op_ret, op_errno); + goto unlock; + } + + page = ra_page_get (file, pending_offset); + if (!page) { + gf_log (this->name, GF_LOG_DEBUG, + "wasted copy: %"PRId64"[+%"PRId64"] file=%p", + pending_offset, file->page_size, file); + goto unlock; + } + + if (page->vector) { + dict_unref (page->ref); + free (page->vector); + } + + page->vector = iov_dup (vector, count); + page->count = count; + page->ref = dict_ref (frame->root->rsp_refs); + page->ready = 1; + + page->size = iov_length (vector, count); + + waitq = ra_page_wakeup (page); + } +unlock: + ra_file_unlock (file); + + ra_waitq_return (waitq); + + fd_unref (local->fd); + + free (frame->local); + frame->local = NULL; + + STACK_DESTROY (frame->root); + return 0; +} + + +void +ra_page_fault (ra_file_t *file, + call_frame_t *frame, + off_t offset) +{ + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + + fault_frame = copy_frame (frame); + fault_local = CALLOC (1, sizeof (ra_local_t)); + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref (file->fd); + + STACK_WIND (fault_frame, ra_fault_cbk, + FIRST_CHILD (fault_frame->this), + FIRST_CHILD (fault_frame->this)->fops->readv, + file->fd, file->page_size, offset); + return; +} + +void +ra_frame_fill (ra_page_t *page, call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min (page->size - src_offset, + local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } + + fill = fill->next; + while (fill != &local->fill) { + if (fill->offset > page->offset) { + break; + } + fill = fill->next; + } + + new = CALLOC (1, sizeof (*new)); + + new->offset = page->offset; + new->size = copy_size; + new->refs = dict_ref (page->ref); + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + NULL); + new->vector = CALLOC (new->count, sizeof (struct iovec)); + + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + new->vector); + + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; + + local->op_ret += copy_size; + } +} + + +void +ra_frame_unwind (call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector; + int32_t copied = 0; + dict_t *refs = NULL; + ra_fill_t *next = NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + local = frame->local; + fill = local->fill.next; + + refs = get_new_dict (); + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + vector = CALLOC (count, sizeof (*vector)); + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + memcpy (((char *)vector) + copied, fill->vector, + fill->count * sizeof (*vector)); + + copied += (fill->count * sizeof (*vector)); + dict_copy (fill->refs, refs); + + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; + + dict_unref (fill->refs); + free (fill->vector); + free (fill); + + fill = next; + } + + frame->root->rsp_refs = dict_ref (refs); + + fd = local->fd; + ret = fd_ctx_get (fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + vector, count, &file->stbuf); + + dict_unref (refs); + pthread_mutex_destroy (&local->local_lock); + free (local); + free (vector); + + return; +} + +/* + * ra_frame_return - + * @frame: + * + */ +void +ra_frame_return (call_frame_t *frame) +{ + ra_local_t *local = NULL; + int32_t wait_count = 0; + + local = frame->local; + assert (local->wait_count > 0); + + ra_local_lock (local); + { + wait_count = --local->wait_count; + } + ra_local_unlock (local); + + if (!wait_count) + ra_frame_unwind (frame); + + return; +} + +/* + * ra_page_wakeup - + * @page: + * + */ +ra_waitq_t * +ra_page_wakeup (ra_page_t *page) +{ + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame; + + waitq = page->waitq; + page->waitq = NULL; + + trav = waitq; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill (page, frame); + } + + return waitq; +} + +/* + * ra_page_purge - + * @page: + * + */ +void +ra_page_purge (ra_page_t *page) +{ + page->prev->next = page->next; + page->next->prev = page->prev; + + if (page->ref) { + dict_unref (page->ref); + } + free (page->vector); + free (page); +} + +/* + * ra_page_error - + * @page: + * @op_ret: + * @op_errno: + * + */ +ra_waitq_t * +ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +{ + + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + ra_local_t *local = NULL; + + waitq = page->waitq; + page->waitq = NULL; + + trav = waitq; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + + ra_page_purge (page); + + return waitq; +} + +/* + * ra_file_destroy - + * @file: + * + */ +void +ra_file_destroy (ra_file_t *file) +{ + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; + + conf = file->conf; + + ra_conf_lock (conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + ra_conf_unlock (conf); + + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error (trav, -1, EINVAL); + trav = file->pages.next; + } + + pthread_mutex_destroy (&file->file_lock); + free (file); +} + diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c new file mode 100644 index 00000000000..0060e00fd41 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -0,0 +1,890 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* + TODO: + - handle O_DIRECT + - maintain offset, flush on lseek + - ensure efficient memory managment in case of random seek +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> +#include <sys/time.h> + + +static void +read_ahead (call_frame_t *frame, + ra_file_t *file); + + +int +ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = CALLOC (1, sizeof (*file)); + if (!file) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + + if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) + file->disabled = 1; + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, NULL); + + if (!file->disabled) { + file->page_count = 1; + } + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + + +int +ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = CALLOC (1, sizeof (*file)); + if (!file) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + + if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) + file->disabled = 1; + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + //file->size = fd->inode->buf.st_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, NULL); + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + + return 0; +} + + +int +ra_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + STACK_WIND (frame, ra_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd); + + return 0; +} + +int +ra_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + STACK_WIND (frame, ra_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + + return 0; +} + +/* free cache pages between offset and offset+size, + does not touch pages with frames waiting on it +*/ + +static void +flush_region (call_frame_t *frame, + ra_file_t *file, + off_t offset, + off_t size) +{ + ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + + ra_file_lock (file); + { + trav = file->pages.next; + while (trav != &file->pages + && trav->offset < (offset + size)) { + + next = trav->next; + if (trav->offset >= offset && !trav->waitq) { + ra_page_purge (trav); + } + trav = next; + } + } + ra_file_unlock (file); +} + + + +int +ra_release (xlator_t *this, + fd_t *fd) +{ + uint64_t tmp_file = 0; + int ret = 0; + + ret = fd_ctx_del (fd, this, &tmp_file); + + if (!ret) { + ra_file_destroy ((ra_file_t *)(long)tmp_file); + } + + return 0; +} + + +void +read_ahead (call_frame_t *frame, ra_file_t *file) +{ + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + if (!file->page_count) + return; + + ra_size = file->page_size * file->page_count; + ra_offset = floor (file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min (file->offset + ra_size, cap)) { + + ra_file_lock (file); + { + trav = ra_page_get (file, ra_offset); + } + ra_file_unlock (file); + + if (!trav) + break; + + ra_offset += file->page_size; + } + + if (trav) + /* comfortable enough */ + return; + + trav_offset = ra_offset; + + trav = file->pages.next; + cap = file->size ? file->size : ra_offset + ra_size; + + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create (file, trav_offset); + if (trav) + trav->dirty = 1; + } + } + ra_file_unlock (file); + + if (!trav) { + /* OUT OF MEMORY */ + break; + } + + if (fault) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "RA at offset=%"PRId64, trav_offset); + ra_page_fault (file, frame, trav_offset); + } + trav_offset += file->page_size; + } + + return; +} + + +int +ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static void +dispatch_requests (call_frame_t *frame, + ra_file_t *file) +{ + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; + + + local = frame->local; + conf = file->conf; + + rounded_offset = floor (local->offset, file->page_size); + rounded_end = roof (local->offset + local->size, file->page_size); + + trav_offset = rounded_offset; + trav = file->pages.next; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + trav = ra_page_create (file, trav_offset); + fault = 1; + need_atime_update = 0; + } + + if (!trav) + goto unlock; + + if (trav->ready) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "HIT at offset=%"PRId64".", + trav_offset); + ra_frame_fill (trav, frame); + } else { + gf_log (frame->this->name, GF_LOG_DEBUG, + "IN-TRANSIT at offset=%"PRId64".", + trav_offset); + ra_wait_on_page (trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock (file); + + if (fault) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "MISS at offset=%"PRId64".", + trav_offset); + ra_page_fault (file, frame, trav_offset); + } + + trav_offset += file->page_size; + } + + if (need_atime_update && conf->force_atime_update) { + /* TODO: use untimens() since readv() can confuse underlying + io-cache and others */ + ra_frame = copy_frame (frame); + STACK_WIND (ra_frame, ra_need_atime_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + file->fd, 1, 1); + } + + return ; +} + + +int +ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + + return 0; +} + + +int +ra_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + ra_file_t *file = NULL; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + int op_errno = 0; + int ret = 0; + char expected_offset = 1; + uint64_t tmp_file = 0; + + conf = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", + offset, size); + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file->offset != offset) { + gf_log (this->name, GF_LOG_DEBUG, + "unexpected offset (%"PRId64" != %"PRId64") resetting", + file->offset, offset); + + expected_offset = file->expected = file->page_count = 0; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "expected offset (%"PRId64") when page_count=%d", + offset, file->page_count); + + if (file->expected < (conf->page_size * conf->page_count)) { + file->expected += size; + file->page_count = min ((file->expected / file->page_size), + conf->page_count); + } + } + + if (!expected_offset) { + flush_region (frame, file, 0, file->pages.prev->offset + 1); + } + + if (file->disabled) { + STACK_WIND (frame, ra_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + file->fd, size, offset); + return 0; + } + + local = (void *) CALLOC (1, sizeof (*local)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto unwind; + } + + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; + + local->fill.next = &local->fill; + local->fill.prev = &local->fill; + + pthread_mutex_init (&local->local_lock, NULL); + + frame->local = local; + + dispatch_requests (frame, file); + + flush_region (frame, file, 0, floor (offset, file->page_size)); + + read_ahead (frame, file); + + ra_frame_return (frame); + + file->offset = offset + size; + + return 0; + +unwind: + STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + + return 0; +} + + +int +ra_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int +ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + STACK_WIND (frame, ra_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, + fd); + return 0; +} + + +int +ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + STACK_WIND (frame, ra_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, datasync); + return 0; +} + + +int +ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + fd_t *fd = NULL; + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + fd = frame->local; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + frame->local = NULL; + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + + +int +ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; + } + + frame->local = fd; + + STACK_WIND (frame, ra_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset); + + return 0; +} + + +int +ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int +ra_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = loc->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset); + return 0; +} + + +int +ra_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, + fd); + return 0; +} + + +int +ra_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fchown, + fd, uid, gid); + return 0; +} + + +int +ra_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + fd, offset); + return 0; +} + + +int +init (xlator_t *this) +{ + ra_conf_t *conf; + dict_t *options = this->options; + char *page_size_string = NULL; + char *page_count_string = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: read-ahead not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + conf = (void *) CALLOC (1, sizeof (*conf)); + ERR_ABORT (conf); + conf->page_size = 256 * 1024; + conf->page_count = 2; + + if (dict_get (options, "page-size")) + page_size_string = data_to_str (dict_get (options, + "page-size")); + if (page_size_string) + { + if (gf_string2bytesize (page_size_string, &conf->page_size) != 0) + { + gf_log ("read-ahead", + GF_LOG_ERROR, + "invalid number format \"%s\" of \"option page-size\"", + page_size_string); + return -1; + } + + gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_size = %"PRIu64"", + conf->page_size); + } + + if (dict_get (options, "page-count")) + page_count_string = data_to_str (dict_get (options, + "page-count")); + if (page_count_string) + { + if (gf_string2uint_base10 (page_count_string, &conf->page_count) != 0) + { + gf_log ("read-ahead", + GF_LOG_ERROR, + "invalid number format \"%s\" of \"option page-count\"", + page_count_string); + return -1; + } + gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u", + conf->page_count); + } + + if (dict_get (options, "force-atime-update")) { + char *force_atime_update_str = data_to_str (dict_get (options, + "force-atime-update")); + if (gf_string2boolean (force_atime_update_str, &conf->force_atime_update) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "'force-atime-update' takes only boolean options"); + return -1; + } + if (conf->force_atime_update) + gf_log (this->name, GF_LOG_DEBUG, "Forcing atime updates on cache hit"); + } + + conf->files.next = &conf->files; + conf->files.prev = &conf->files; + + pthread_mutex_init (&conf->conf_lock, NULL); + this->private = conf; + return 0; +} + +void +fini (xlator_t *this) +{ + ra_conf_t *conf = this->private; + + pthread_mutex_destroy (&conf->conf_lock); + FREE (conf); + + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + .writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .fchown = ra_fchown, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { + .release = ra_release, +}; + +struct volume_options options[] = { + { .key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 64 * GF_UNIT_KB, + .max = 2 * GF_UNIT_MB + }, + { .key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16 + }, + { .key = {NULL} }, +}; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h new file mode 100644 index 00000000000..d624ca8abc8 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -0,0 +1,194 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __READ_AHEAD_H +#define __READ_AHEAD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "common-utils.h" + +struct ra_conf; +struct ra_local; +struct ra_page; +struct ra_file; +struct ra_waitq; + + +struct ra_waitq { + struct ra_waitq *next; + void *data; +}; + + +struct ra_fill { + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + dict_t *refs; +}; + + +struct ra_local { + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; +}; + + +struct ra_page { + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; + dict_t *ref; +}; + + +struct ra_file { + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct stat stbuf; + uint64_t page_size; + uint32_t page_count; +}; + + +struct ra_conf { + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; +}; + + +typedef struct ra_conf ra_conf_t; +typedef struct ra_local ra_local_t; +typedef struct ra_page ra_page_t; +typedef struct ra_file ra_file_t; +typedef struct ra_waitq ra_waitq_t; +typedef struct ra_fill ra_fill_t; + +ra_page_t * +ra_page_get (ra_file_t *file, + off_t offset); +ra_page_t * +ra_page_create (ra_file_t *file, + off_t offset); +void +ra_page_fault (ra_file_t *file, + call_frame_t *frame, + off_t offset); +void +ra_wait_on_page (ra_page_t *page, + call_frame_t *frame); +ra_waitq_t * +ra_page_wakeup (ra_page_t *page); + +void +ra_page_flush (ra_page_t *page); + +ra_waitq_t * +ra_page_error (ra_page_t *page, + int32_t op_ret, + int32_t op_errno); +void +ra_page_purge (ra_page_t *page); + +void +ra_frame_return (call_frame_t *frame); +void +ra_frame_fill (ra_page_t *page, + call_frame_t *frame); + +void +ra_file_destroy (ra_file_t *file); + +static inline void +ra_file_lock (ra_file_t *file) +{ + pthread_mutex_lock (&file->file_lock); +} + +static inline void +ra_file_unlock (ra_file_t *file) +{ + pthread_mutex_unlock (&file->file_lock); +} + +static inline void +ra_conf_lock (ra_conf_t *conf) +{ + pthread_mutex_lock (&conf->conf_lock); +} + +static inline void +ra_conf_unlock (ra_conf_t *conf) +{ + pthread_mutex_unlock (&conf->conf_lock); +} +static inline void +ra_local_lock (ra_local_t *local) +{ + pthread_mutex_lock (&local->local_lock); +} + +static inline void +ra_local_unlock (ra_local_t *local) +{ + pthread_mutex_unlock (&local->local_lock); +} + +#endif /* __READ_AHEAD_H */ |