From 561624aad540c4a7df49ab62bd8a9a75810d76b2 Mon Sep 17 00:00:00 2001 From: Amar Tumballi Date: Fri, 11 Jan 2019 15:00:27 +0530 Subject: fuse: add --lru-limit option The inode LRU mechanism is moot in fuse xlator (i.e. there is no limit for the LRU list), as fuse inodes are referenced from kernel context, and thus they can only be dropped on request of the kernel. This might result in a high number of passive inodes which are useless for the glusterfs client, causing a significant memory overhead. This change tries to remedy this by extending the LRU semantics and allowing a finite limit to be set on the fuse inode LRU. A brief history of the problem: When gluster's inode table was designed, fuse didn't have any 'invalidate' method, which means a userspace application could never ask the kernel to send a 'forget()' fop, and instead had to wait for the kernel to send it based on the kernel's parameters. The inode table remembers the number of times the kernel has cached the inode based on the 'nlookup' parameter. And the 'nlookup' field is not used by any other entry points (like server-protocol, gfapi etc). Hence the inode_table of the fuse module always had to have lru-limit as '0', which means no limit. GlusterFS always had to keep all inodes in memory as the kernel would have had a reference to them. Again, the reason for this is that the kernel's glusterfs inode reference was a pointer to the 'inode_t' structure in glusterfs. As it is a pointer, we could never free it (to prevent segfault, or memory corruption). Solution: In the inode table, handle the prune case of inodes with 'nlookup' differently, and call an 'invalidator' method, which in this case is fuse_invalidate(), and it sends the request to the kernel for getting the forget request. When the kernel sends the forget, it means it has dropped all references to the inode, and it will send the forget with the 'nlookup' parameter too. We just need to make sure to reduce the 'nlookup' value we have when we get the forget. 
That automatically cause the relevant prune to happen. Credits: Csaba Henk, Xavier Hernandez, Raghavendra Gowdappa, Nithya B fixes: bz#1623107 Change-Id: Ifee0737b23b12b1426c224ec5b8f591f487d83a2 Signed-off-by: Amar Tumballi --- xlators/mount/fuse/src/fuse-bridge.c | 127 +++++++++++++++++----------- xlators/mount/fuse/src/fuse-bridge.h | 3 + xlators/mount/fuse/utils/mount.glusterfs.in | 7 ++ 3 files changed, 86 insertions(+), 51 deletions(-) (limited to 'xlators/mount') diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index efb390a9c54..374f20e106f 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -215,8 +215,8 @@ check_and_dump_fuse_W(fuse_private_t *priv, struct iovec *iov_out, int count, struct fuse_out_header *fouh = NULL; if (res == -1) { - gf_log("glusterfs-fuse", GF_LOG_ERROR, - "writing to fuse device failed: %s", strerror(errno)); + gf_log_callingfn("glusterfs-fuse", GF_LOG_ERROR, + "writing to fuse device failed: %s", strerror(errno)); return errno; } @@ -311,29 +311,29 @@ send_fuse_data(xlator_t *this, fuse_in_header_t *finh, void *data, size_t size) #define send_fuse_obj(this, finh, obj) \ send_fuse_data(this, finh, obj, sizeof(*(obj))) -#if FUSE_KERNEL_MINOR_VERSION >= 11 static void fuse_invalidate_entry(xlator_t *this, uint64_t fuse_ino) { +#if FUSE_KERNEL_MINOR_VERSION >= 11 struct fuse_out_header *fouh = NULL; struct fuse_notify_inval_entry_out *fnieo = NULL; fuse_private_t *priv = NULL; dentry_t *dentry = NULL; + dentry_t *tmp = NULL; inode_t *inode = NULL; size_t nlen = 0; fuse_invalidate_node_t *node = NULL; + char gfid_str[UUID_CANONICAL_FORM_LEN + 1]; priv = this->private; - if (!priv->reverse_fuse_thread_started) return; - inode = fuse_ino_to_inode(fuse_ino, this); - if (inode == NULL) { + inode = (inode_t *)(unsigned long)fuse_ino; + if (inode == NULL) return; - } - list_for_each_entry(dentry, &inode->dentry_list, inode_list) + 
list_for_each_entry_safe(dentry, tmp, &inode->dentry_list, inode_list) { node = GF_CALLOC(1, sizeof(*node), gf_fuse_mt_invalidate_node_t); if (node == NULL) @@ -347,38 +347,41 @@ fuse_invalidate_entry(xlator_t *this, uint64_t fuse_ino) fouh->unique = 0; fouh->error = FUSE_NOTIFY_INVAL_ENTRY; - nlen = strlen(dentry->name); - fouh->len = sizeof(*fouh) + sizeof(*fnieo) + nlen + 1; - fnieo->parent = inode_to_fuse_nodeid(dentry->parent); - - fnieo->namelen = nlen; - strcpy(node->inval_buf + sizeof(*fouh) + sizeof(*fnieo), dentry->name); + if (dentry->name) { + nlen = strlen(dentry->name); + fouh->len = sizeof(*fouh) + sizeof(*fnieo) + nlen + 1; + fnieo->parent = inode_to_fuse_nodeid(dentry->parent); - pthread_mutex_lock(&priv->invalidate_mutex); - { - list_add_tail(&node->next, &priv->invalidate_list); - pthread_cond_signal(&priv->invalidate_cond); + fnieo->namelen = nlen; + strcpy((node->inval_buf + sizeof(*fouh) + sizeof(*fnieo)), + dentry->name); } - pthread_mutex_unlock(&priv->invalidate_mutex); gf_log("glusterfs-fuse", GF_LOG_TRACE, - "INVALIDATE entry: " - "%" PRIu64 "/%s", - fnieo->parent, dentry->name); + "INVALIDATE entry: %" PRIu64 "/%s (gfid:%s)", fnieo->parent, + dentry->name, uuid_utoa(inode->gfid)); if (dentry->parent) { - fuse_log_eh(this, "Invalidated entry %s (parent: %s)", dentry->name, - uuid_utoa(dentry->parent->gfid)); + fuse_log_eh(this, "Invalidated entry %s (parent: %s) gfid:%s", + dentry->name, uuid_utoa(dentry->parent->gfid), + uuid_utoa_r(inode->gfid, gfid_str)); } else { - fuse_log_eh(this, "Invalidated entry %s(nodeid: %" PRIu64 ")", - dentry->name, fnieo->parent); + fuse_log_eh(this, + "Invalidated entry %s(nodeid: %" PRIu64 ") gfid:%s", + dentry->name, fnieo->parent, uuid_utoa(inode->gfid)); + } + + pthread_mutex_lock(&priv->invalidate_mutex); + { + list_add_tail(&node->next, &priv->invalidate_list); + pthread_cond_signal(&priv->invalidate_cond); } + pthread_mutex_unlock(&priv->invalidate_mutex); } - if (inode) - inode_unref(inode); -} 
#endif + return; +} /* * Send an inval inode notification to fuse. This causes an invalidation of the @@ -399,6 +402,10 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino) if (!priv->reverse_fuse_thread_started) return; + inode = (inode_t *)(unsigned long)fuse_ino; + if (inode == NULL) + return; + node = GF_CALLOC(1, sizeof(*node), gf_fuse_mt_invalidate_node_t); if (node == NULL) return; @@ -418,7 +425,11 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino) fniio->off = 0; fniio->len = -1; - inode = fuse_ino_to_inode(fuse_ino, this); + fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino, + uuid_utoa(inode->gfid)); + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "INVALIDATE inode: %" PRIu64 "(gfid:%s)", fuse_ino, + uuid_utoa(inode->gfid)); pthread_mutex_lock(&priv->invalidate_mutex); { @@ -427,24 +438,22 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino) } pthread_mutex_unlock(&priv->invalidate_mutex); - gf_log("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %" PRIu64, - fuse_ino); - - if (inode) { - fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino, - uuid_utoa(inode->gfid)); - } else { - fuse_log_eh(this, "Invalidated inode %" PRIu64, fuse_ino); - } - - if (inode) - inode_unref(inode); #else gf_log("glusterfs-fuse", GF_LOG_WARNING, - "fuse_invalidate_inode not implemented on OS X due to missing FUSE " - "notification"); + "fuse_invalidate_inode not implemented on this system"); #endif + return; +} + +#if FUSE_KERNEL_MINOR_VERSION >= 11 +/* Need this function for the signature (inode_t *, instead of uint64_t) */ +static int32_t +fuse_inode_invalidate_fn(xlator_t *this, inode_t *inode) +{ + fuse_invalidate_entry(this, (uint64_t)inode); + return 0; } +#endif int send_fuse_err(xlator_t *this, fuse_in_header_t *finh, int error) @@ -707,11 +716,14 @@ do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup) { inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this); + gf_log("fuse", 
GF_LOG_TRACE, + "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique, + nodeid, nlookup, uuid_utoa(fuse_inode->gfid)); + fuse_log_eh(this, "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid)); - inode_forget(fuse_inode, nlookup); - inode_unref(fuse_inode); + inode_forget_with_unref(fuse_inode, nlookup); } static void @@ -726,10 +738,6 @@ fuse_forget(xlator_t *this, fuse_in_header_t *finh, void *msg, return; } - gf_log("glusterfs-fuse", GF_LOG_TRACE, - "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64, finh->unique, - finh->nodeid, ffi->nlookup); - do_forget(this, finh->unique, finh->nodeid, ffi->nlookup); GF_FREE(finh); @@ -4926,7 +4934,9 @@ fuse_thread_proc(void *data) fuse_in_header_t *finh = NULL; struct iovec iov_in[2]; void *msg = NULL; - const size_t msg0_size = sizeof(*finh) + 128; + /* we need 512 extra buffer size for BATCH_FORGET fop. By tests, it is + found to be reduces 'REALLOC()' in the loop */ + const size_t msg0_size = sizeof(*finh) + 512; fuse_handler_t **fuse_ops = NULL; struct pollfd pfd[2] = {{ 0, @@ -5258,7 +5268,12 @@ fuse_graph_setup(xlator_t *this, glusterfs_graph_t *graph) goto unlock; } +#if FUSE_KERNEL_MINOR_VERSION >= 11 + itable = inode_table_with_invalidator(priv->lru_limit, graph->top, + fuse_inode_invalidate_fn, this); +#else itable = inode_table_new(0, graph->top); +#endif if (!itable) { ret = -1; goto unlock; @@ -5708,6 +5723,8 @@ init(xlator_t *this_xl) } } + GF_OPTION_INIT("lru-limit", priv->lru_limit, uint32, cleanup_exit); + GF_OPTION_INIT("event-history", priv->event_history, bool, cleanup_exit); GF_OPTION_INIT("thin-client", priv->thin_client, bool, cleanup_exit); @@ -6025,5 +6042,13 @@ struct volume_options options[] = { .max = 1000000000, .description = "Supported granularity of file attribute times.", }, + { + .key = {"lru-limit"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "131072", + .min = 0, + .description = "makes glusterfs invalidate kernel inodes 
after " + "reaching this limit (0 means 'unlimited')", + }, {.key = {NULL}}, }; diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h index 318f33b5d61..84022ca1ecc 100644 --- a/xlators/mount/fuse/src/fuse-bridge.h +++ b/xlators/mount/fuse/src/fuse-bridge.h @@ -151,6 +151,9 @@ struct fuse_private { /* Writeback cache support */ gf_boolean_t kernel_writeback_cache; int attr_times_granularity; + + /* LRU Limit, if not set, default is 128k for now */ + uint32_t lru_limit; }; typedef struct fuse_private fuse_private_t; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 959f2272e2f..ceeeb100952 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -249,6 +249,10 @@ start_glusterfs () cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout"); fi + if [ -n "$lru_limit" ]; then + cmd_line=$(echo "$cmd_line --lru-limit=$lru_limit"); + fi + if [ -n "$bg_qlen" ]; then cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen"); fi @@ -485,6 +489,9 @@ with_options() "gid-timeout") gid_timeout=$value ;; + "lru-limit") + lru_limit=$value + ;; "background-qlen") bg_qlen=$value ;; -- cgit