diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2016-12-08 16:24:15 -0500 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2017-01-30 19:13:58 -0500 |
commit | 1a95fc3036db51b82b6a80952f0908bc2019d24a (patch) | |
tree | b983ac196a8165d5cb5e860a5ef97d3e9a41b5c9 /libglusterfs | |
parent | 7f7d7a939e46b330a084d974451eee4757ba61b4 (diff) |
core: run many bricks within one glusterfsd process
This patch adds support for multiple brick translator stacks running
in a single brick server process. This reduces our per-brick memory usage by
approximately 3x, and our appetite for TCP ports even more. It also creates
the potential to avoid process/thread thrashing and to improve QoS by scheduling
more carefully across the bricks, but realizing that potential will require
further work.
Multiplexing is controlled by the "cluster.brick-multiplex" global option. By
default it's off, and bricks are started in separate processes as before. If
multiplexing is enabled, then *compatible* bricks (mostly those with the same
transport options) will be started in the same process.
Change-Id: I45059454e51d6f4cbb29a4953359c09a408695cb
BUG: 1385758
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://review.gluster.org/14763
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'libglusterfs')
-rw-r--r-- | libglusterfs/src/client_t.c | 49 | ||||
-rw-r--r-- | libglusterfs/src/common-utils.c | 15 | ||||
-rw-r--r-- | libglusterfs/src/event-epoll.c | 3 | ||||
-rw-r--r-- | libglusterfs/src/event.h | 16 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 5 | ||||
-rw-r--r-- | libglusterfs/src/graph.c | 127 | ||||
-rw-r--r-- | libglusterfs/src/locking.c | 2 | ||||
-rw-r--r-- | libglusterfs/src/xlator.c | 72 | ||||
-rw-r--r-- | libglusterfs/src/xlator.h | 7 |
9 files changed, 261 insertions, 35 deletions
diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c index b3eb4e4df8c..c20c4089ec3 100644 --- a/libglusterfs/src/client_t.c +++ b/libglusterfs/src/client_t.c @@ -331,11 +331,25 @@ gf_client_ref (client_t *client) static void +gf_client_destroy_recursive (xlator_t *xl, client_t *client) +{ + xlator_list_t *trav; + + if (xl->cbks->client_destroy) { + xl->cbks->client_destroy (xl, client); + } + + for (trav = xl->children; trav; trav = trav->next) { + gf_client_destroy_recursive (trav->xlator, client); + } +} + + +static void client_destroy (client_t *client) { clienttable_t *clienttable = NULL; glusterfs_graph_t *gtrav = NULL; - xlator_t *xtrav = NULL; if (client == NULL){ gf_msg_callingfn ("xlator", GF_LOG_ERROR, EINVAL, @@ -358,12 +372,7 @@ client_destroy (client_t *client) UNLOCK (&clienttable->lock); list_for_each_entry (gtrav, &client->this->ctx->graphs, list) { - xtrav = gtrav->top; - while (xtrav != NULL) { - if (xtrav->cbks->client_destroy != NULL) - xtrav->cbks->client_destroy (xtrav, client); - xtrav = xtrav->next; - } + gf_client_destroy_recursive (gtrav->top, client); } GF_FREE (client->auth.data); GF_FREE (client->auth.username); @@ -375,22 +384,32 @@ out: return; } +static int +gf_client_disconnect_recursive (xlator_t *xl, client_t *client) +{ + int ret = 0; + xlator_list_t *trav; + + if (xl->cbks->client_disconnect) { + ret = xl->cbks->client_disconnect (xl, client); + } + + for (trav = xl->children; trav; trav = trav->next) { + ret |= gf_client_disconnect_recursive (trav->xlator, client); + } + + return ret; +} + int gf_client_disconnect (client_t *client) { int ret = 0; glusterfs_graph_t *gtrav = NULL; - xlator_t *xtrav = NULL; list_for_each_entry (gtrav, &client->this->ctx->graphs, list) { - xtrav = gtrav->top; - while (xtrav != NULL) { - if (xtrav->cbks->client_disconnect != NULL) - if (xtrav->cbks->client_disconnect (xtrav, client) != 0) - ret = -1; - xtrav = xtrav->next; - } + ret |= gf_client_disconnect_recursive (gtrav->top, 
client); } return ret; diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 0486409a849..e180dd3eec0 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -3646,15 +3646,17 @@ gf_is_service_running (char *pidfile, int *pid) int fno = 0; file = fopen (pidfile, "r+"); - if (!file) + if (!file) { goto out; + } fno = fileno (file); ret = lockf (fno, F_TEST, 0); if (ret == -1) running = _gf_true; - if (!pid) + if (!pid) { goto out; + } ret = fscanf (file, "%d", pid); if (ret <= 0) { @@ -3663,6 +3665,15 @@ gf_is_service_running (char *pidfile, int *pid) *pid = -1; } + if (!*pid) { + /* + * PID 0 means we've started the process, but it hasn't gotten + * far enough to put in a real PID yet. More details are in + * glusterd_brick_start. + */ + running = _gf_true; + } + out: if (file) fclose (file); diff --git a/libglusterfs/src/event-epoll.c b/libglusterfs/src/event-epoll.c index 3fd580d9d1a..e2b40602e7a 100644 --- a/libglusterfs/src/event-epoll.c +++ b/libglusterfs/src/event-epoll.c @@ -263,6 +263,7 @@ event_pool_new_epoll (int count, int eventthreadcount) event_pool->count = count; event_pool->eventthreadcount = eventthreadcount; + event_pool->auto_thread_count = 0; pthread_mutex_init (&event_pool->mutex, NULL); @@ -363,7 +364,7 @@ event_register_epoll (struct event_pool *event_pool, int fd, time as well. 
*/ - slot->events = EPOLLPRI | EPOLLONESHOT; + slot->events = EPOLLPRI | EPOLLHUP | EPOLLERR | EPOLLONESHOT; slot->handler = handler; slot->data = data; diff --git a/libglusterfs/src/event.h b/libglusterfs/src/event.h index b01ef24bb8e..1348f5d05c0 100644 --- a/libglusterfs/src/event.h +++ b/libglusterfs/src/event.h @@ -28,7 +28,7 @@ typedef int (*event_handler_t) (int fd, int idx, void *data, #define EVENT_EPOLL_TABLES 1024 #define EVENT_EPOLL_SLOTS 1024 -#define EVENT_MAX_THREADS 32 +#define EVENT_MAX_THREADS 1024 struct event_pool { struct event_ops *ops; @@ -57,6 +57,20 @@ struct event_pool { * and live status */ int destroy; int activethreadcount; + + /* + * Number of threads created by auto-scaling, *in addition to* the + * configured number of threads. This is only applicable on the + * server, where we try to keep the number of threads around the number + * of bricks. In that case, the configured number is just "extra" + * threads to handle requests in excess of one per brick (including + * requests on the GlusterD connection). For clients or GlusterD, this + * number will always be zero, so the "extra" is all we have. 
+ * + * TBD: consider auto-scaling for clients as well + */ + int auto_thread_count; + }; struct event_ops { diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 0d073154934..4f1f27b5857 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -557,16 +557,19 @@ typedef struct lock_migration_info { */ #define SECURE_ACCESS_FILE GLUSTERD_DEFAULT_WORKDIR "/secure-access" -int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx); +int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, + char *volume_name); int glusterfs_graph_destroy_residual (glusterfs_graph_t *graph); int glusterfs_graph_deactivate (glusterfs_graph_t *graph); int glusterfs_graph_destroy (glusterfs_graph_t *graph); int glusterfs_get_leaf_count (glusterfs_graph_t *graph); int glusterfs_graph_activate (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx); glusterfs_graph_t *glusterfs_graph_construct (FILE *fp); +int glusterfs_graph_init (glusterfs_graph_t *graph); glusterfs_graph_t *glusterfs_graph_new (void); int glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph, glusterfs_graph_t *newgraph); +int glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path); void gf_free_mig_locks (lock_migration_info_t *locks); diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c index 04bb92c7c75..b090f8a3554 100644 --- a/libglusterfs/src/graph.c +++ b/libglusterfs/src/graph.c @@ -407,13 +407,11 @@ fill_uuid (char *uuid, int size) int -glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) +glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, + char *volume_name) { - const char *volume_name = NULL; xlator_t *trav = NULL; - volume_name = ctx->cmd_args.volume_name; - if (!volume_name) { graph->top = graph->first; return 0; @@ -454,7 +452,8 @@ glusterfs_graph_parent_up (glusterfs_graph_t *graph) int -glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) 
+glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, + char *volume_name) { xlator_t *trav = NULL; int ret = 0; @@ -462,12 +461,20 @@ glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) /* XXX: CHECKSUM */ /* XXX: attach to -n volname */ - ret = glusterfs_graph_settop (graph, ctx); + ret = glusterfs_graph_settop (graph, ctx, volume_name); if (ret) { + char *slash = rindex (volume_name, '/'); + if (slash) { + ret = glusterfs_graph_settop (graph, ctx, slash + 1); + if (!ret) { + goto ok; + } + } gf_msg ("graph", GF_LOG_ERROR, 0, LG_MSG_GRAPH_ERROR, "glusterfs graph settop failed"); return -1; } +ok: /* XXX: WORM VOLUME */ ret = glusterfs_graph_worm (graph, ctx); @@ -749,7 +756,7 @@ xlator_equal_rec (xlator_t *xl1, xlator_t *xl2) } /* type could have changed even if xlator names match, - e.g cluster/distrubte and cluster/nufa share the same + e.g cluster/distribute and cluster/nufa share the same xlator name */ if (strcmp (xl1->type, xl2->type)) { @@ -764,13 +771,27 @@ out : gf_boolean_t is_graph_topology_equal (glusterfs_graph_t *graph1, glusterfs_graph_t *graph2) { - xlator_t *trav1 = NULL; - xlator_t *trav2 = NULL; - gf_boolean_t ret = _gf_true; + xlator_t *trav1 = NULL; + xlator_t *trav2 = NULL; + gf_boolean_t ret = _gf_true; + xlator_list_t *ltrav; trav1 = graph1->first; trav2 = graph2->first; + if (strcmp (trav2->type, "protocol/server") == 0) { + trav2 = trav2->children->xlator; + for (ltrav = trav1->children; ltrav; ltrav = ltrav->next) { + trav1 = ltrav->xlator; + if (strcmp (trav1->name, trav2->name) == 0) { + break; + } + } + if (!ltrav) { + return _gf_false; + } + } + ret = xlator_equal_rec (trav1, trav2); if (ret) { @@ -869,7 +890,8 @@ glusterfs_volfile_reconfigure (int oldvollen, FILE *newvolfile_fp, goto out; } - glusterfs_graph_prepare (newvolfile_graph, ctx); + glusterfs_graph_prepare (newvolfile_graph, ctx, + ctx->cmd_args.volume_name); if (!is_graph_topology_equal (oldvolfile_graph, newvolfile_graph)) { @@ 
-917,8 +939,9 @@ int glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph, glusterfs_graph_t *newgraph) { - xlator_t *old_xl = NULL; - xlator_t *new_xl = NULL; + xlator_t *old_xl = NULL; + xlator_t *new_xl = NULL; + xlator_list_t *trav; GF_ASSERT (oldgraph); GF_ASSERT (newgraph); @@ -933,7 +956,25 @@ glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph, new_xl = new_xl->children->xlator; } - return xlator_tree_reconfigure (old_xl, new_xl); + if (strcmp (old_xl->type, "protocol/server") != 0) { + return xlator_tree_reconfigure (old_xl, new_xl); + } + + /* Some options still need to be handled by the server translator. */ + if (old_xl->reconfigure) { + old_xl->reconfigure (old_xl, new_xl->options); + } + + (void) copy_opts_to_child (new_xl, FIRST_CHILD (new_xl), "*auth*"); + new_xl = FIRST_CHILD (new_xl); + + for (trav = old_xl->children; trav; trav = trav->next) { + if (strcmp (trav->xlator->name, new_xl->name) == 0) { + return xlator_tree_reconfigure (trav->xlator, new_xl); + } + } + + return -1; } int @@ -987,3 +1028,61 @@ glusterfs_graph_destroy (glusterfs_graph_t *graph) out: return ret; } + + +int +glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path) +{ + xlator_t *this = THIS; + FILE *fp; + glusterfs_graph_t *graph; + xlator_t *xl; + char *volfile_id; + + fp = fopen (path, "r"); + if (!fp) { + gf_log (THIS->name, GF_LOG_WARNING, + "oops, %s disappeared on us", path); + return -EIO; + } + + graph = glusterfs_graph_construct (fp); + fclose(fp); + if (!graph) { + gf_log (this->name, GF_LOG_WARNING, + "could not create graph from %s", path); + return -EIO; + } + + /* + * If there's a server translator on top, we want whatever's below + * that. 
+ */ + xl = graph->first; + if (strcmp(xl->type, "protocol/server") == 0) { + (void) copy_opts_to_child (xl, FIRST_CHILD (xl), "*auth*"); + xl = FIRST_CHILD(xl); + } + graph->first = xl; + + + volfile_id = strstr (path, "/snaps/"); + if (!volfile_id) { + volfile_id = rindex (path, '/'); + if (volfile_id) { + ++volfile_id; + } + } + if (volfile_id) { + xl->volfile_id = gf_strdup (volfile_id); + /* There's a stray ".vol" at the end. */ + xl->volfile_id[strlen(xl->volfile_id)-4] = '\0'; + } + + /* TBD: memory leaks everywhere */ + glusterfs_graph_prepare (graph, this->ctx, xl->name); + glusterfs_graph_init (graph); + glusterfs_xlator_link (orig_graph->top, graph->top); + + return 0; +} diff --git a/libglusterfs/src/locking.c b/libglusterfs/src/locking.c index d3b9754ef76..f27b0d05b35 100644 --- a/libglusterfs/src/locking.c +++ b/libglusterfs/src/locking.c @@ -22,7 +22,7 @@ int use_spinlocks = 0; static void __attribute__((constructor)) gf_lock_setup (void) { - use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1); + //use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1); } #endif diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 2edebc0aec2..4702ea3eb77 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -406,6 +406,59 @@ out: return search; } + +/* + * With brick multiplexing, we sort of have multiple graphs, so + * xlator_search_by_name might not find what we want. Also, the translator + * we're looking for might not be a direct child if something else was put in + * between (as already happened with decompounder before that was fixed) and + * it's hard to debug why our translator wasn't found. Using a recursive tree + * search instead of a linear search works around both problems. + */ +static xlator_t * +get_xlator_by_name_or_type (xlator_t *this, char *target, int is_name) +{ + xlator_list_t *trav; + xlator_t *child_xl; + char *value; + + for (trav = this->children; trav; trav = trav->next) { + value = is_name ? 
trav->xlator->name : trav->xlator->type; + if (strcmp(value, target) == 0) { + return trav->xlator; + } + child_xl = get_xlator_by_name_or_type (trav->xlator, target, + is_name); + if (child_xl) { + /* + * If the xlator we're looking for is somewhere down + * the stack, get_xlator_by_name expects to get a + * pointer to the top of its subtree (child of "this") + * while get_xlator_by_type expects a pointer to what + * we actually found. Handle both cases here. + * + * TBD: rename the functions and fix callers to better + * reflect the difference in semantics. + */ + return is_name ? trav->xlator : child_xl; + } + } + + return NULL; +} + +xlator_t * +get_xlator_by_name (xlator_t *this, char *target) +{ + return get_xlator_by_name_or_type (this, target, 1); +} + +xlator_t * +get_xlator_by_type (xlator_t *this, char *target) +{ + return get_xlator_by_name_or_type (this, target, 0); +} + static int __xlator_init(xlator_t *xl) { @@ -1104,3 +1157,22 @@ xlator_subvolume_count (xlator_t *this) i++; return i; } + +static int +_copy_opt_to_child (dict_t *options, char *key, data_t *value, void *data) +{ + xlator_t *child = data; + + gf_log (__func__, GF_LOG_DEBUG, + "copying %s to child %s", key, child->name); + dict_set (child->options, key, value); + + return 0; +} + +int +copy_opts_to_child (xlator_t *src, xlator_t *dst, char *glob) +{ + return dict_foreach_fnmatch (src->options, glob, + _copy_opt_to_child, dst); +} diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index e28790cc034..1e2698bb61f 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -950,6 +950,9 @@ struct _xlator { /* for the memory pool of 'frame->local' */ struct mem_pool *local_pool; gf_boolean_t is_autoloaded; + + /* Saved volfile ID (used for multiplexing) */ + char *volfile_id; }; typedef struct { @@ -1004,6 +1007,8 @@ void xlator_foreach_depth_first (xlator_t *this, void *data); xlator_t *xlator_search_by_name (xlator_t *any, const char *name); +xlator_t 
*get_xlator_by_name (xlator_t *this, char *target); +xlator_t *get_xlator_by_type (xlator_t *this, char *target); void xlator_set_inode_lru_limit (xlator_t *this, void *data); @@ -1050,5 +1055,7 @@ xlator_subvolume_count (xlator_t *this); void xlator_init_lock (void); void xlator_init_unlock (void); +int +copy_opts_to_child (xlator_t *src, xlator_t *dst, char *glob); #endif /* _XLATOR_H */ |