From bcd092a21f4284277a7f59c58715bb253ed90ff7 Mon Sep 17 00:00:00 2001 From: Raghavendra G Date: Sun, 23 Aug 2009 22:28:18 +0000 Subject: rewriting stat-prefetch translator - stat-prefetch aims to optimize operations like 'ls -l' where a readdir is immediately followed by stat calls on each of the directory entry read. More details on design can be found in doc/stat-prefetch-design.txt Signed-off-by: Anand V. Avati BUG: 221 (stat prefetch implementation) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=221 --- configure.ac | 2 + doc/stat-prefetch-design.txt | 128 +++++ xlators/performance/Makefile.am | 2 +- xlators/performance/stat-prefetch/src/Makefile.am | 11 +- .../performance/stat-prefetch/src/stat-prefetch.c | 521 ++------------------- .../performance/stat-prefetch/src/stat-prefetch.h | 12 +- 6 files changed, 177 insertions(+), 499 deletions(-) create mode 100644 doc/stat-prefetch-design.txt diff --git a/configure.ac b/configure.ac index 6a951096..6fbaab2c 100644 --- a/configure.ac +++ b/configure.ac @@ -67,6 +67,8 @@ AC_CONFIG_FILES([Makefile xlators/performance/symlink-cache/src/Makefile xlators/performance/quick-read/Makefile xlators/performance/quick-read/src/Makefile + xlators/performance/stat-prefetch/Makefile + xlators/performance/stat-prefetch/src/Makefile xlators/debug/Makefile xlators/debug/trace/Makefile xlators/debug/trace/src/Makefile diff --git a/doc/stat-prefetch-design.txt b/doc/stat-prefetch-design.txt new file mode 100644 index 00000000..65f1b922 --- /dev/null +++ b/doc/stat-prefetch-design.txt @@ -0,0 +1,128 @@ +what is stat-prefetch? +====================== +It is a translator which caches the dentries read in readdir. This dentry +list is stored in the context of fd. Later when lookup happens on +[parent-inode, basename (path)] combination, this list is searched for the +basename. The dentry thus searched is used to fill up the stat corresponding +to path being looked upon, thereby short-cutting lookup calls. 
This cache is +preserved till closedir is called on the fd. The purpose of this translator +is to optimize operations like 'ls -l', where a readdir is followed by +lookup (stat) calls on each directory entry. + +1. stat-prefetch harnesses the efficiency of short lookup calls + (saves network roundtrip time for lookup calls from being accounted to + the stat call). +2. To maintain the correctness, it does lookup-behind - lookup is wound to + underlying translators after it is unwound to upper translators. + A lookup-behind is necessary as inode gets populated in server inode table + only in lookup-cbk. Also various translators store their contexts in inode + contexts during lookup calls. + +fops to be implemented: +====================== +* lookup + Check the dentry cache stored in context of fds opened by the same process + on parent inode for basename. If found unwind with cached stat, else wind + the lookup call to underlying translators. We also store the stat of the path in + the context of the inode if the path being looked up happens to be a directory. + This stat will be used to fill postparent stat when lookup happens on any of + the directory contents. + +* readdir + Cache the direntries returned in readdir_cbk in the context of fd. If the + readdir is happening on unexpected offsets (meaning a seekdir/rewinddir + has happened), the cache has to be flushed. + +* chmod/fchmod + Delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode, since these calls change st_mode and ctime of + stat. + +* chown/fchown + Delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode, since these calls change st_uid/st_gid and + st_ctime of stat. + +* truncate/ftruncate + Delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode, since these calls change st_size/st_mtime of stat. 
+ +* utimens + Delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode, since this call changes st_atime/st_mtime of stat. + +* readlink + Delete the entry corresponding to basename from cache stored in context of fds + opened on parent inode, since this call changes st_atime of stat. + +* unlink + 1. Delete the entry corresponding to basename from cache stored in context of + fds opened on parent directory containing file being unlinked. + 2. Delete the entry corresponding to basename of parent directory from cache + of its parent directory. + +* rmdir + 1. Delete the entry corresponding to basename from cache stored in context of + fds opened on parent inode. + 2. Remove the entire cache from all fds opened on inode corresponding to + directory being removed. + +* readv + Delete the entry corresponding to basename from cache stored in context of fds + opened on parent inode, since readv changes st_atime of file. + +* writev + Delete the entry corresponding to basename from cache stored in context of fds + opened on parent inode, since writev can possibly change st_size and definitely + changes st_mtime of file. + +* fsync + It is unclear whether fsync updates mtime/ctime. Disk-based + filesystems (at least ext2) just write the times stored in inode to disk + during fsync and not the time at which fsync is being done. But in glusterfs, + a translator like write-behind actually sends writes during fsync which will + change mtime/ctime. Hence stat-prefetch implements fsync to delete the entry + corresponding to basename from cache stored in context of fds opened on parent + inode. + +* rename + 1. remove entry corresponding to oldname from cache stored in fd contexts of + old parent directory. + 2. remove entry corresponding to new parent directory from cache stored in + fd contexts of its parent directory. 
+ +* create/mknod/mkdir/symlink/link + Delete entry corresponding to basename of directory in which these operations + are happening, from cache stored in context of fds of parent directory. Note + that the parent directory containing the cache is the parent of the directory in which + these operations are happening. + +* setxattr/removexattr + Delete the entry corresponding to basename from cache stored in context of fds + opened on parent inode, since setxattr changes st_ctime of file. + +* setdents/getdents/checksum/xattrop/fxattrop + These calls modify various times of stat structure, hence appropriate entries + have to be removed from the cache. I am leaving these calls unimplemented in + stat-prefetch for the time being. Once we have a working translator, these five fops + will be implemented. + +callbacks to be implemented: +======================= +* releasedir + Flush the stat-prefetch cache. + +* forget + Free the stat if the inode corresponds to a directory. + +limitations: +============ +* since a readdir does not return extended attributes of file, if need_xattr is + set, short-cutting of lookup does not happen and lookup is passed to + underlying translators. + +* posix_readdir does not check whether the dentries are spanning across multiple + mount points. Hence it is not transforming inode numbers in stat buffers if + posix is configured to allow export directory spanning on multiple mountpoints. + This is a bug which needs to be fixed. posix_readdir should treat dentries the + same way as if lookup is happening on dentries. 
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am index 6b5facca..e91d5f6e 100644 --- a/xlators/performance/Makefile.am +++ b/xlators/performance/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read +SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read stat-prefetch CLEANFILES = diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am index e52f2df4..b16c133a 100644 --- a/xlators/performance/stat-prefetch/src/Makefile.am +++ b/xlators/performance/stat-prefetch/src/Makefile.am @@ -1,11 +1,14 @@ -xlator_PROGRAMS = stat-prefetch.so +xlator_LTLIBRARIES = stat-prefetch.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -stat_prefetch_so_SOURCES = stat-prefetch.c +stat_prefetch_la_LDFLAGS = -module -avoidversion +stat_prefetch_la_SOURCES = stat-prefetch.c noinst_HEADERS = stat-prefetch.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles +stat_prefetch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c b/xlators/performance/stat-prefetch/src/stat-prefetch.c index c6bf1e68..6a10ac4c 100644 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.c +++ b/xlators/performance/stat-prefetch/src/stat-prefetch.c @@ -1,508 +1,53 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. 
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . + Copyright (c) 2009-2010 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
*/ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" #include "stat-prefetch.h" -#include "dict.h" -#include "xlator.h" -#include - -struct sp_cache { - struct sp_cache *next; - struct sp_cache *prev; - pid_t pid; - long long tv_time; - char *dirname; - dir_entry_t entries; - int32_t count; - pthread_mutex_t lock; -}; - -static void -stat_prefetch_cache_flush (struct sp_cache *cache, int32_t force) -{ - struct sp_cache *trav; - struct timeval tv; - long long tv_time; - - gettimeofday (&tv, NULL); - tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)); - - pthread_mutex_lock (&cache->lock); - - trav = cache->next; - while (trav != cache) { - struct sp_cache *next = trav->next; - { - if (tv_time > trav->tv_time || force) { - gf_log ("stat-prefetch", - GF_LOG_DEBUG, - "flush on: %s", - trav->dirname); - dir_entry_t *entries; - - trav->prev->next = trav->next; - trav->next->prev = trav->prev; - - entries = trav->entries.next; - - while (entries) { - dir_entry_t *nextentry = entries->next; - { - free (entries->name); - free (entries); - } - entries = nextentry; - } - free (trav->dirname); - free (trav); - } - } - trav = next; - } - - pthread_mutex_unlock (&cache->lock); -} - -static int32_t -stat_prefetch_cache_fill (struct sp_cache *cache, - pid_t pid, - char *dirname, - dir_entry_t *entries) -{ - struct sp_cache *trav; - struct timeval tv; - - pthread_mutex_unlock (&cache->lock); - trav = cache->next; - while (trav != cache) { - // if (trav->pid == pid && !strcmp (trav->dirname, dirname)) { - if (!strcmp (trav->dirname, dirname)) { - break; - } - trav = trav->next; - } - - if (trav == cache) { - trav = CALLOC (1, sizeof (*trav)); - ERR_ABORT (trav); - trav->pid = pid; - trav->dirname = dirname; - - trav->prev = cache->prev; - trav->next = cache; - trav->next->prev = trav; - trav->prev->next = trav; - } else { - free (dirname); - } - - while (trav->entries.next) { - dir_entry_t *tmp = trav->entries.next; - - trav->entries.next = 
trav->entries.next->next; - free (tmp->name); - free (tmp); - } - trav->entries.next = entries->next; - entries->next = NULL; - - gettimeofday (&tv, NULL); - trav->tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)) + cache->tv_time; - - pthread_mutex_unlock (&cache->lock); - return 0; -} - -static int32_t -stat_prefetch_cache_lookup (struct sp_cache *cache, - pid_t pid, - const char *path, - struct stat *buf) -{ - struct sp_cache *trav; - char *dirname = strdup (path); - char *filename = strrchr (dirname, '/'); - dir_entry_t *entries; - dir_entry_t *prev = NULL; - - *filename = '\0'; - filename ++; - - pthread_mutex_lock (&cache->lock); - trav = cache->next; - while (trav != cache) { - // if ((trav->pid == pid) && !strcmp (dirname, trav->dirname)) - if (!strcmp (dirname, trav->dirname)) - break; - trav = trav->next; - } - if (trav == cache) { - free (dirname); - pthread_mutex_unlock (&cache->lock); - return -1; - } - - entries = trav->entries.next; - prev = &trav->entries; - while (entries) { - if (!strcmp (entries->name, filename)) - break; - prev = entries; - entries = entries->next; - } - if (!entries) { - free (dirname); - pthread_mutex_unlock (&cache->lock); - return -1; - } - - *buf = entries->buf; - prev->next = entries->next; - free (entries->name); - free (entries); - free (dirname); - - pthread_mutex_unlock (&cache->lock); - - return 0; -} - - -int32_t -stat_prefetch_readdir_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - dir_entry_t *entries, - int32_t count) -{ - char *path = frame->local; - pid_t pid = frame->root->pid; - frame->local = NULL; - - STACK_UNWIND (frame, op_ret, op_errno, entries, count); - - if (op_ret == 0) - stat_prefetch_cache_fill (this->private, - pid, - path, - entries); - else - free (path); - - return 0; -} - -int32_t -stat_prefetch_readdir (call_frame_t *frame, - xlator_t *this, - const char *path) -{ - stat_prefetch_cache_flush (this->private, 0); - - frame->local = strdup (path); 
- STACK_WIND (frame, - stat_prefetch_readdir_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->readdir, - path); - return 0; -} - - -int32_t -stat_prefetch_getattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_getattr (call_frame_t *frame, - struct xlator *this, - const char *path) -{ - struct stat buf; - pid_t pid = frame->root->pid; - stat_prefetch_cache_flush (this->private, 0); - - if (stat_prefetch_cache_lookup (this->private, - pid, - path, - &buf) == 0) { - STACK_UNWIND (frame, 0, 0, &buf); - return 0; - } - - STACK_WIND (frame, - stat_prefetch_getattr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getattr, - path); - - return 0; -} - - -int32_t -stat_prefetch_unlink_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -stat_prefetch_unlink (call_frame_t *frame, - struct xlator *this, - const char *path) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_unlink_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->unlink, - path); - - return 0; -} - - -int32_t -stat_prefetch_chmod_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_chmod (call_frame_t *frame, - struct xlator *this, - const char *path, - mode_t mode) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_chmod_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->chmod, - path, - mode); - - return 0; -} - - -int32_t -stat_prefetch_chown_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, 
op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_chown (call_frame_t *frame, - struct xlator *this, - const char *path, - uid_t uid, - gid_t gid) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_chown_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->chown, - path, - uid, - gid); - - return 0; -} - - -int32_t -stat_prefetch_utimes_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_utimes (call_frame_t *frame, - struct xlator *this, - const char *path, - struct timespec *tvp) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_utimes_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->utimes, - path, - tvp); - - return 0; -} - - -int32_t -stat_prefetch_truncate_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, - struct stat *buf) -{ - STACK_UNWIND (frame, op_ret, op_errno, buf); - return 0; -} - -int32_t -stat_prefetch_truncate (call_frame_t *frame, - struct xlator *this, - const char *path, - off_t offset) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_truncate_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->truncate, - path, - offset); - - return 0; -} - - -int32_t -stat_prefetch_rename_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, op_ret, op_errno); - return 0; -} - -int32_t -stat_prefetch_rename (call_frame_t *frame, - struct xlator *this, - const char *oldpath, - const char *newpath) -{ - stat_prefetch_cache_flush (this->private, 1); - - STACK_WIND (frame, - stat_prefetch_rename_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->rename, - oldpath, - newpath); - - return 0; -} int32_t -init (struct xlator *this) +init (xlator_t *this) { - struct 
sp_cache *cache; - dict_t *options = this->options; + int32_t ret = -1; + if (!this->children || this->children->next) { + gf_log ("stat-prefetch", + GF_LOG_ERROR, + "FATAL: translator %s does not have exactly one child " + "node", this->name); + goto out; + } - if (!this->children || this->children->next) { - gf_log ("stat-prefetch", - GF_LOG_ERROR, - "FATAL: translator %s does not have exactly one child node", - this->name); - return -1; - } - - cache = (void *) CALLOC (1, sizeof (*cache)); - ERR_ABORT (cache); - cache->next = cache->prev = cache; - - cache->tv_time = 1 * 1000000; - - if (dict_get (options, "cache-seconds")) { - cache->tv_time = (data_to_int64 (dict_get (options, "cache-seconds")) * - 1000000); - } - - pthread_mutex_init (&cache->lock, NULL); - - this->private = cache; - return 0; + ret = 0; +out: + return ret; } void -fini (struct xlator *this) +fini (xlator_t *this) { - return; + return; } struct xlator_fops fops = { - .getattr = stat_prefetch_getattr, - .readdir = stat_prefetch_readdir, - .unlink = stat_prefetch_unlink, - .chmod = stat_prefetch_chmod, - .chown = stat_prefetch_chown, - .rename = stat_prefetch_rename, - .utimes = stat_prefetch_utimes, - .truncate = stat_prefetch_truncate, }; struct xlator_mops mops = { }; + +struct xlator_cbks cbks = { +}; diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h index ef82952b..bd8d9e7e 100644 --- a/xlators/performance/stat-prefetch/src/stat-prefetch.h +++ b/xlators/performance/stat-prefetch/src/stat-prefetch.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2009 Z RESEARCH, Inc. + Copyright (c) 2009-2010 Z RESEARCH, Inc. This file is part of GlusterFS. GlusterFS is free software; you can redistribute it and/or modify @@ -17,16 +17,16 @@ . 
*/ -#ifndef _STAT_PREFETCH_H_ -#define _STAT_PREFETCH_H_ +#ifndef _STAT_PREFETCH_H +#define _STAT_PREFETCH_H #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include -#include +#include "glusterfs.h" +#include "dict.h" #include "xlator.h" -#endif /* _STAT_PREFETCH_H_ */ +#endif /* #ifndef _STAT_PREFETCH_H */ -- cgit