summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaghavendra G <raghavendra@gluster.com>2009-08-23 22:28:18 +0000
committerAnand V. Avati <avati@dev.gluster.com>2009-09-08 01:40:54 -0700
commitbcd092a21f4284277a7f59c58715bb253ed90ff7 (patch)
treeccc0e1b0fcf02e7a22282148ff88adda9bd999e6
parent314eb5fecf61b61ae9ba6bd76a44ea14bbd04513 (diff)
rewriting stat-prefetch translator
- stat-prefetch aims to optimize operations like 'ls -l' where a readdir is immediately followed by stat calls on each of the directory entry read. More details on design can be found in doc/stat-prefetch-design.txt Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 221 (stat prefetch implementation) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=221
-rw-r--r--configure.ac2
-rw-r--r--doc/stat-prefetch-design.txt128
-rw-r--r--xlators/performance/Makefile.am2
-rw-r--r--xlators/performance/stat-prefetch/src/Makefile.am11
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.c521
-rw-r--r--xlators/performance/stat-prefetch/src/stat-prefetch.h12
6 files changed, 177 insertions, 499 deletions
diff --git a/configure.ac b/configure.ac
index 6a951096822..6fbaab2c852 100644
--- a/configure.ac
+++ b/configure.ac
@@ -67,6 +67,8 @@ AC_CONFIG_FILES([Makefile
xlators/performance/symlink-cache/src/Makefile
xlators/performance/quick-read/Makefile
xlators/performance/quick-read/src/Makefile
+ xlators/performance/stat-prefetch/Makefile
+ xlators/performance/stat-prefetch/src/Makefile
xlators/debug/Makefile
xlators/debug/trace/Makefile
xlators/debug/trace/src/Makefile
diff --git a/doc/stat-prefetch-design.txt b/doc/stat-prefetch-design.txt
new file mode 100644
index 00000000000..65f1b922705
--- /dev/null
+++ b/doc/stat-prefetch-design.txt
@@ -0,0 +1,128 @@
+what is stat-prefetch?
+======================
+It is a translator which caches the dentries read in readdir. This dentry
+list is stored in the context of fd. Later when lookup happens on
+[parent-inode, basename (path)] combination, this list is searched for the
+basename. The dentry thus found is used to fill in the stat corresponding
+to the path being looked up, thereby short-cutting lookup calls. This cache is
+preserved till closedir is called on the fd. The purpose of this translator
+is to optimize operations like 'ls -l', where a readdir is followed by
+lookup (stat) calls on each directory entry.
+
+1. stat-prefetch harnesses the efficiency of short lookup calls
+ (saves network roundtrip time for lookup calls from being accounted to
+ the stat call).
+2. To maintain correctness, it does lookup-behind: the lookup is wound to the
+ underlying translators after it has been unwound to the upper translators.
+ A lookup-behind is necessary because the inode gets populated in the server
+ inode table only in lookup-cbk. Also, various translators store their
+ contexts in the inode context during lookup calls.
+
+fops to be implemented:
+======================
+* lookup
+ Search for basename in the dentry caches stored in the contexts of fds opened
+ by the same process on the parent inode. If found, unwind with the cached
+ stat; otherwise wind the lookup call to the underlying translators. We also
+ store the stat of the path in the inode context if the path being looked up
+ happens to be a directory. This stat is used to fill the postparent stat
+ when a lookup happens on any of the directory's contents.
+
+* readdir
+ Cache the direntries returned in readdir_cbk in the context of the fd. If
+ the readdir happens at an unexpected offset (meaning a seekdir/rewinddir
+ has happened), the cache has to be flushed.
+
+* chmod/fchmod
+ Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent inode, since these calls change st_mode and ctime of
+ stat.
+
+* chown/fchown
+ Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent inode, since these calls change st_uid/st_gid and
+ st_ctime of stat.
+
+* truncate/ftruncate
+ Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent inode, since these calls change st_size/st_mtime of stat.
+
+* utimens
+ Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent inode, since this call changes st_atime/st_mtime of stat.
+
+* readlink
+ Delete the entry corresponding to basename from cache stored in context of fds
+ opened on parent inode, since this call changes st_atime of stat.
+
+* unlink
+ 1. Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent directory containing file being unlinked.
+ 2. Delete the entry corresponding to basename of parent directory from cache
+ of its parent directory.
+
+* rmdir
+ 1. Delete the entry corresponding to basename from cache stored in context of
+ fds opened on parent inode.
+ 2. Remove the entire cache from all fds opened on inode corresponding to
+ directory being removed.
+
+* readv
+ Delete the entry corresponding to basename from cache stored in context of fds
+ opened on parent inode, since readv changes st_atime of file.
+
+* writev
+ Delete the entry corresponding to basename from cache stored in context of fds
+ opened on parent inode, since writev can possibly change st_size and definitely
+ changes st_mtime of file.
+
+* fsync
+ It is unclear whether fsync updates mtime/ctime. Disk-based filesystems
+ (at least ext2) just write the times stored in the inode to disk during
+ fsync, not the time at which the fsync is done. But in glusterfs, a
+ translator like write-behind actually sends writes during fsync, which
+ changes mtime/ctime. Hence stat-prefetch implements fsync to delete the entry
+ corresponding to basename from cache stored in context of fds opened on parent
+ inode.
+
+* rename
+ 1. remove entry corresponding to oldname from cache stored in fd contexts of
+ old parent directory.
+ 2. remove entry corresponding to new parent directory from cache stored in
+ fd contexts of its parent directory.
+
+* create/mknod/mkdir/symlink/link
+ Delete the entry corresponding to the basename of the directory in which these
+ operations are happening from the cache stored in the context of fds opened
+ on its parent directory. Note that the cache in question belongs to the
+ parent of the directory in which these operations are happening.
+
+* setxattr/removexattr
+ Delete the entry corresponding to basename from cache stored in context of fds
+ opened on parent inode, since setxattr changes st_ctime of file.
+
+* setdents/getdents/checksum/xattrop/fxattrop
+ These calls modify various time fields of the stat structure, hence the
+ appropriate entries have to be removed from the cache. These calls are left
+ unimplemented in stat-prefetch for the time being. Once we have a working
+ translator, these five fops will be implemented.
+
+callbacks to be implemented:
+=======================
+* releasedir
+ Flush the stat-prefetch cache.
+
+* forget
+ Free the stat if the inode corresponds to a directory.
+
+limitations:
+============
+* since a readdir does not return extended attributes of file, if need_xattr is
+ set, short-cutting of lookup does not happen and lookup is passed to
+ underlying translators.
+
+* posix_readdir does not check whether the dentries span multiple mount points.
+ Hence it does not transform inode numbers in stat buffers when posix is
+ configured to allow the export directory to span multiple mount points.
+ This is a bug which needs to be fixed. posix_readdir should treat dentries the
+ same way a lookup on those dentries would.
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index 6b5facca51e..e91d5f6efc8 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read
+SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read stat-prefetch
CLEANFILES =
diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am
index e52f2df48fd..b16c133a1ab 100644
--- a/xlators/performance/stat-prefetch/src/Makefile.am
+++ b/xlators/performance/stat-prefetch/src/Makefile.am
@@ -1,11 +1,14 @@
-xlator_PROGRAMS = stat-prefetch.so
+xlator_LTLIBRARIES = stat-prefetch.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-stat_prefetch_so_SOURCES = stat-prefetch.c
+stat_prefetch_la_LDFLAGS = -module -avoidversion
+stat_prefetch_la_SOURCES = stat-prefetch.c
noinst_HEADERS = stat-prefetch.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles
+stat_prefetch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c b/xlators/performance/stat-prefetch/src/stat-prefetch.c
index c6bf1e684cf..6a10ac4c42c 100644
--- a/xlators/performance/stat-prefetch/src/stat-prefetch.c
+++ b/xlators/performance/stat-prefetch/src/stat-prefetch.c
@@ -1,508 +1,53 @@
/*
- Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2009-2010 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
#include "stat-prefetch.h"
-#include "dict.h"
-#include "xlator.h"
-#include <sys/time.h>
-
-struct sp_cache {
- struct sp_cache *next;
- struct sp_cache *prev;
- pid_t pid;
- long long tv_time;
- char *dirname;
- dir_entry_t entries;
- int32_t count;
- pthread_mutex_t lock;
-};
-
-static void
-stat_prefetch_cache_flush (struct sp_cache *cache, int32_t force)
-{
- struct sp_cache *trav;
- struct timeval tv;
- long long tv_time;
-
- gettimeofday (&tv, NULL);
- tv_time = (tv.tv_usec + (tv.tv_sec * 1000000));
-
- pthread_mutex_lock (&cache->lock);
-
- trav = cache->next;
- while (trav != cache) {
- struct sp_cache *next = trav->next;
- {
- if (tv_time > trav->tv_time || force) {
- gf_log ("stat-prefetch",
- GF_LOG_DEBUG,
- "flush on: %s",
- trav->dirname);
- dir_entry_t *entries;
-
- trav->prev->next = trav->next;
- trav->next->prev = trav->prev;
-
- entries = trav->entries.next;
-
- while (entries) {
- dir_entry_t *nextentry = entries->next;
- {
- free (entries->name);
- free (entries);
- }
- entries = nextentry;
- }
- free (trav->dirname);
- free (trav);
- }
- }
- trav = next;
- }
-
- pthread_mutex_unlock (&cache->lock);
-}
-
-static int32_t
-stat_prefetch_cache_fill (struct sp_cache *cache,
- pid_t pid,
- char *dirname,
- dir_entry_t *entries)
-{
- struct sp_cache *trav;
- struct timeval tv;
-
- pthread_mutex_unlock (&cache->lock);
- trav = cache->next;
- while (trav != cache) {
- // if (trav->pid == pid && !strcmp (trav->dirname, dirname)) {
- if (!strcmp (trav->dirname, dirname)) {
- break;
- }
- trav = trav->next;
- }
-
- if (trav == cache) {
- trav = CALLOC (1, sizeof (*trav));
- ERR_ABORT (trav);
- trav->pid = pid;
- trav->dirname = dirname;
-
- trav->prev = cache->prev;
- trav->next = cache;
- trav->next->prev = trav;
- trav->prev->next = trav;
- } else {
- free (dirname);
- }
-
- while (trav->entries.next) {
- dir_entry_t *tmp = trav->entries.next;
-
- trav->entries.next = trav->entries.next->next;
- free (tmp->name);
- free (tmp);
- }
- trav->entries.next = entries->next;
- entries->next = NULL;
-
- gettimeofday (&tv, NULL);
- trav->tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)) + cache->tv_time;
-
- pthread_mutex_unlock (&cache->lock);
- return 0;
-}
-
-static int32_t
-stat_prefetch_cache_lookup (struct sp_cache *cache,
- pid_t pid,
- const char *path,
- struct stat *buf)
-{
- struct sp_cache *trav;
- char *dirname = strdup (path);
- char *filename = strrchr (dirname, '/');
- dir_entry_t *entries;
- dir_entry_t *prev = NULL;
-
- *filename = '\0';
- filename ++;
-
- pthread_mutex_lock (&cache->lock);
- trav = cache->next;
- while (trav != cache) {
- // if ((trav->pid == pid) && !strcmp (dirname, trav->dirname))
- if (!strcmp (dirname, trav->dirname))
- break;
- trav = trav->next;
- }
- if (trav == cache) {
- free (dirname);
- pthread_mutex_unlock (&cache->lock);
- return -1;
- }
-
- entries = trav->entries.next;
- prev = &trav->entries;
- while (entries) {
- if (!strcmp (entries->name, filename))
- break;
- prev = entries;
- entries = entries->next;
- }
- if (!entries) {
- free (dirname);
- pthread_mutex_unlock (&cache->lock);
- return -1;
- }
-
- *buf = entries->buf;
- prev->next = entries->next;
- free (entries->name);
- free (entries);
- free (dirname);
-
- pthread_mutex_unlock (&cache->lock);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_readdir_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dir_entry_t *entries,
- int32_t count)
-{
- char *path = frame->local;
- pid_t pid = frame->root->pid;
- frame->local = NULL;
-
- STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-
- if (op_ret == 0)
- stat_prefetch_cache_fill (this->private,
- pid,
- path,
- entries);
- else
- free (path);
-
- return 0;
-}
-
-int32_t
-stat_prefetch_readdir (call_frame_t *frame,
- xlator_t *this,
- const char *path)
-{
- stat_prefetch_cache_flush (this->private, 0);
-
- frame->local = strdup (path);
- STACK_WIND (frame,
- stat_prefetch_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- path);
- return 0;
-}
-
-
-int32_t
-stat_prefetch_getattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct stat *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-int32_t
-stat_prefetch_getattr (call_frame_t *frame,
- struct xlator *this,
- const char *path)
-{
- struct stat buf;
- pid_t pid = frame->root->pid;
- stat_prefetch_cache_flush (this->private, 0);
-
- if (stat_prefetch_cache_lookup (this->private,
- pid,
- path,
- &buf) == 0) {
- STACK_UNWIND (frame, 0, 0, &buf);
- return 0;
- }
-
- STACK_WIND (frame,
- stat_prefetch_getattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getattr,
- path);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_unlink_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-int32_t
-stat_prefetch_unlink (call_frame_t *frame,
- struct xlator *this,
- const char *path)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- path);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_chmod_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct stat *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-int32_t
-stat_prefetch_chmod (call_frame_t *frame,
- struct xlator *this,
- const char *path,
- mode_t mode)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_chmod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->chmod,
- path,
- mode);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_chown_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct stat *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-int32_t
-stat_prefetch_chown (call_frame_t *frame,
- struct xlator *this,
- const char *path,
- uid_t uid,
- gid_t gid)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_chown_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->chown,
- path,
- uid,
- gid);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_utimes_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct stat *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-int32_t
-stat_prefetch_utimes (call_frame_t *frame,
- struct xlator *this,
- const char *path,
- struct timespec *tvp)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_utimes_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->utimes,
- path,
- tvp);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_truncate_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct stat *buf)
-{
- STACK_UNWIND (frame, op_ret, op_errno, buf);
- return 0;
-}
-
-int32_t
-stat_prefetch_truncate (call_frame_t *frame,
- struct xlator *this,
- const char *path,
- off_t offset)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- path,
- offset);
-
- return 0;
-}
-
-
-int32_t
-stat_prefetch_rename_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno)
-{
- STACK_UNWIND (frame, op_ret, op_errno);
- return 0;
-}
-
-int32_t
-stat_prefetch_rename (call_frame_t *frame,
- struct xlator *this,
- const char *oldpath,
- const char *newpath)
-{
- stat_prefetch_cache_flush (this->private, 1);
-
- STACK_WIND (frame,
- stat_prefetch_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldpath,
- newpath);
-
- return 0;
-}
int32_t
-init (struct xlator *this)
+init (xlator_t *this)
{
- struct sp_cache *cache;
- dict_t *options = this->options;
+ int32_t ret = -1;
+ if (!this->children || this->children->next) {
+ gf_log ("stat-prefetch",
+ GF_LOG_ERROR,
+ "FATAL: translator %s does not have exactly one child "
+ "node", this->name);
+ goto out;
+ }
- if (!this->children || this->children->next) {
- gf_log ("stat-prefetch",
- GF_LOG_ERROR,
- "FATAL: translator %s does not have exactly one child node",
- this->name);
- return -1;
- }
-
- cache = (void *) CALLOC (1, sizeof (*cache));
- ERR_ABORT (cache);
- cache->next = cache->prev = cache;
-
- cache->tv_time = 1 * 1000000;
-
- if (dict_get (options, "cache-seconds")) {
- cache->tv_time = (data_to_int64 (dict_get (options, "cache-seconds")) *
- 1000000);
- }
-
- pthread_mutex_init (&cache->lock, NULL);
-
- this->private = cache;
- return 0;
+ ret = 0;
+out:
+ return ret;
}
void
-fini (struct xlator *this)
+fini (xlator_t *this)
{
- return;
+ return;
}
struct xlator_fops fops = {
- .getattr = stat_prefetch_getattr,
- .readdir = stat_prefetch_readdir,
- .unlink = stat_prefetch_unlink,
- .chmod = stat_prefetch_chmod,
- .chown = stat_prefetch_chown,
- .rename = stat_prefetch_rename,
- .utimes = stat_prefetch_utimes,
- .truncate = stat_prefetch_truncate,
};
struct xlator_mops mops = {
};
+
+struct xlator_cbks cbks = {
+};
diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h
index ef82952b0c7..bd8d9e7eb17 100644
--- a/xlators/performance/stat-prefetch/src/stat-prefetch.h
+++ b/xlators/performance/stat-prefetch/src/stat-prefetch.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2009-2010 Z RESEARCH, Inc. <http://www.zresearch.com>
This file is part of GlusterFS.
GlusterFS is free software; you can redistribute it and/or modify
@@ -17,16 +17,16 @@
<http://www.gnu.org/licenses/>.
*/
-#ifndef _STAT_PREFETCH_H_
-#define _STAT_PREFETCH_H_
+#ifndef _STAT_PREFETCH_H
+#define _STAT_PREFETCH_H
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include <stdio.h>
-#include <sys/time.h>
+#include "glusterfs.h"
+#include "dict.h"
#include "xlator.h"
-#endif /* _STAT_PREFETCH_H_ */
+#endif /* #ifndef _STAT_PREFETCH_H */