author    Jeff Darcy <jdarcy@redhat.com>        2016-12-08 16:24:15 -0500
committer Vijay Bellur <vbellur@redhat.com>     2017-01-30 19:13:58 -0500
commit    1a95fc3036db51b82b6a80952f0908bc2019d24a (patch)
tree      b983ac196a8165d5cb5e860a5ef97d3e9a41b5c9 /libglusterfs
parent    7f7d7a939e46b330a084d974451eee4757ba61b4 (diff)
core: run many bricks within one glusterfsd process
This patch adds support for multiple brick translator stacks running in a
single brick server process. This reduces our per-brick memory usage by
approximately 3x, and our appetite for TCP ports even more. It also creates
potential to avoid process/thread thrashing, and to improve QoS by scheduling
more carefully across the bricks, but realizing that potential will require
further work.

Multiplexing is controlled by the "cluster.brick-multiplex" global option.
By default it's off, and bricks are started in separate processes as before.
If multiplexing is enabled, then *compatible* bricks (mostly those with the
same transport options) will be started in the same process.

Change-Id: I45059454e51d6f4cbb29a4953359c09a408695cb
BUG: 1385758
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://review.gluster.org/14763
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
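
Note on usage: because this is a global option rather than a per-volume one,
it is toggled once for the whole cluster; assuming the standard glusterd CLI,
that is "gluster volume set all cluster.brick-multiplex on". The CLI and
glusterd side of the feature are outside this libglusterfs diff.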
Diffstat (limited to 'libglusterfs')
-rw-r--r--  libglusterfs/src/client_t.c        49
-rw-r--r--  libglusterfs/src/common-utils.c    15
-rw-r--r--  libglusterfs/src/event-epoll.c      3
-rw-r--r--  libglusterfs/src/event.h           16
-rw-r--r--  libglusterfs/src/glusterfs.h        5
-rw-r--r--  libglusterfs/src/graph.c          127
-rw-r--r--  libglusterfs/src/locking.c          2
-rw-r--r--  libglusterfs/src/xlator.c          72
-rw-r--r--  libglusterfs/src/xlator.h           7
9 files changed, 261 insertions(+), 35 deletions(-)
diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c
index b3eb4e4df8c..c20c4089ec3 100644
--- a/libglusterfs/src/client_t.c
+++ b/libglusterfs/src/client_t.c
@@ -331,11 +331,25 @@ gf_client_ref (client_t *client)
static void
+gf_client_destroy_recursive (xlator_t *xl, client_t *client)
+{
+ xlator_list_t *trav;
+
+ if (xl->cbks->client_destroy) {
+ xl->cbks->client_destroy (xl, client);
+ }
+
+ for (trav = xl->children; trav; trav = trav->next) {
+ gf_client_destroy_recursive (trav->xlator, client);
+ }
+}
+
+
+static void
client_destroy (client_t *client)
{
clienttable_t *clienttable = NULL;
glusterfs_graph_t *gtrav = NULL;
- xlator_t *xtrav = NULL;
if (client == NULL){
gf_msg_callingfn ("xlator", GF_LOG_ERROR, EINVAL,
@@ -358,12 +372,7 @@ client_destroy (client_t *client)
UNLOCK (&clienttable->lock);
list_for_each_entry (gtrav, &client->this->ctx->graphs, list) {
- xtrav = gtrav->top;
- while (xtrav != NULL) {
- if (xtrav->cbks->client_destroy != NULL)
- xtrav->cbks->client_destroy (xtrav, client);
- xtrav = xtrav->next;
- }
+ gf_client_destroy_recursive (gtrav->top, client);
}
GF_FREE (client->auth.data);
GF_FREE (client->auth.username);
@@ -375,22 +384,32 @@ out:
return;
}
+static int
+gf_client_disconnect_recursive (xlator_t *xl, client_t *client)
+{
+ int ret = 0;
+ xlator_list_t *trav;
+
+ if (xl->cbks->client_disconnect) {
+ ret = xl->cbks->client_disconnect (xl, client);
+ }
+
+ for (trav = xl->children; trav; trav = trav->next) {
+ ret |= gf_client_disconnect_recursive (trav->xlator, client);
+ }
+
+ return ret;
+}
+
int
gf_client_disconnect (client_t *client)
{
int ret = 0;
glusterfs_graph_t *gtrav = NULL;
- xlator_t *xtrav = NULL;
list_for_each_entry (gtrav, &client->this->ctx->graphs, list) {
- xtrav = gtrav->top;
- while (xtrav != NULL) {
- if (xtrav->cbks->client_disconnect != NULL)
- if (xtrav->cbks->client_disconnect (xtrav, client) != 0)
- ret = -1;
- xtrav = xtrav->next;
- }
+ ret |= gf_client_disconnect_recursive (gtrav->top, client);
}
return ret;
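
Note: the recursion above replaces a linear walk because a linear walk no
longer sees everything. Brick graphs attached at runtime (see
glusterfs_graph_attach() in graph.c below) hang off the original graph's top
translator as children; they are not on its ->next chain. A sketch of the
resulting shape, with illustrative brick names:

        /*
         *          protocol/server            <- orig_graph->top
         *           /             \
         *     brick-a stack    brick-b stack  <- linked in by
         *                                        glusterfs_xlator_link()
         *
         * gf_client_destroy_recursive() and gf_client_disconnect_recursive()
         * fan out depth-first through ->children, so the translators of
         * every attached brick see the client_destroy/client_disconnect
         * callbacks.
         */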
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 0486409a849..e180dd3eec0 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -3646,15 +3646,17 @@ gf_is_service_running (char *pidfile, int *pid)
int fno = 0;
file = fopen (pidfile, "r+");
- if (!file)
+ if (!file) {
goto out;
+ }
fno = fileno (file);
ret = lockf (fno, F_TEST, 0);
if (ret == -1)
running = _gf_true;
- if (!pid)
+ if (!pid) {
goto out;
+ }
ret = fscanf (file, "%d", pid);
if (ret <= 0) {
@@ -3663,6 +3665,15 @@ gf_is_service_running (char *pidfile, int *pid)
*pid = -1;
}
+ if (!*pid) {
+ /*
+ * PID 0 means we've started the process, but it hasn't gotten
+ * far enough to put in a real PID yet. More details are in
+ * glusterd_brick_start.
+ */
+ running = _gf_true;
+ }
+
out:
if (file)
fclose (file);
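
A hedged usage sketch of the new pidfile semantics; the pidfile path here is
illustrative:

        int pid = -1;

        if (gf_is_service_running ("/var/run/gluster/brick.pid", &pid)) {
                /*
                 * pid may legitimately be 0 here: the process was started
                 * but has not yet recorded its real PID (see
                 * glusterd_brick_start). It must still count as running,
                 * or the caller might spawn a duplicate.
                 */
        }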
diff --git a/libglusterfs/src/event-epoll.c b/libglusterfs/src/event-epoll.c
index 3fd580d9d1a..e2b40602e7a 100644
--- a/libglusterfs/src/event-epoll.c
+++ b/libglusterfs/src/event-epoll.c
@@ -263,6 +263,7 @@ event_pool_new_epoll (int count, int eventthreadcount)
event_pool->count = count;
event_pool->eventthreadcount = eventthreadcount;
+ event_pool->auto_thread_count = 0;
pthread_mutex_init (&event_pool->mutex, NULL);
@@ -363,7 +364,7 @@ event_register_epoll (struct event_pool *event_pool, int fd,
time as well.
*/
- slot->events = EPOLLPRI | EPOLLONESHOT;
+ slot->events = EPOLLPRI | EPOLLHUP | EPOLLERR | EPOLLONESHOT;
slot->handler = handler;
slot->data = data;
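
For context: per epoll(7), EPOLLHUP and EPOLLERR are always reported even
when not requested, so listing them here mostly documents that hangup/error
wakeups travel the same oneshot path as data events. A generic sketch against
the raw epoll API (epfd, fd, and slot are illustrative, not the event_pool
wrapper):

        struct epoll_event ev = {
                .events = EPOLLPRI | EPOLLHUP | EPOLLERR | EPOLLONESHOT,
                .data   = { .ptr = slot },
        };

        /* With EPOLLONESHOT the fd is disarmed after each wakeup and
         * stays silent until re-armed: */
        epoll_ctl (epfd, EPOLL_CTL_MOD, fd, &ev);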
diff --git a/libglusterfs/src/event.h b/libglusterfs/src/event.h
index b01ef24bb8e..1348f5d05c0 100644
--- a/libglusterfs/src/event.h
+++ b/libglusterfs/src/event.h
@@ -28,7 +28,7 @@ typedef int (*event_handler_t) (int fd, int idx, void *data,
#define EVENT_EPOLL_TABLES 1024
#define EVENT_EPOLL_SLOTS 1024
-#define EVENT_MAX_THREADS 32
+#define EVENT_MAX_THREADS 1024
struct event_pool {
struct event_ops *ops;
@@ -57,6 +57,20 @@ struct event_pool {
* and live status */
int destroy;
int activethreadcount;
+
+ /*
+ * Number of threads created by auto-scaling, *in addition to* the
+ * configured number of threads. This is only applicable on the
+ * server, where we try to keep the number of threads around the number
+ * of bricks. In that case, the configured number is just "extra"
+ * threads to handle requests in excess of one per brick (including
+ * requests on the GlusterD connection). For clients or GlusterD, this
+ * number will always be zero, so the "extra" is all we have.
+ *
+ * TBD: consider auto-scaling for clients as well
+ */
+ int auto_thread_count;
+
};
struct event_ops {
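
The comment above implies a simple sizing rule; the helper below is a minimal
illustration of it, not part of the patch (the real scaling logic lives in
the event and server code):

        /* Effective worker-thread target on a brick server: the
         * configured "extra" threads plus one per brick, clamped by
         * the raised EVENT_MAX_THREADS cap. */
        static int
        effective_thread_target (struct event_pool *pool)
        {
                int target = pool->eventthreadcount +
                             pool->auto_thread_count;

                return (target > EVENT_MAX_THREADS) ? EVENT_MAX_THREADS
                                                    : target;
        }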
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 0d073154934..4f1f27b5857 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -557,16 +557,19 @@ typedef struct lock_migration_info {
*/
#define SECURE_ACCESS_FILE GLUSTERD_DEFAULT_WORKDIR "/secure-access"
-int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx);
+int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx,
+ char *volume_name);
int glusterfs_graph_destroy_residual (glusterfs_graph_t *graph);
int glusterfs_graph_deactivate (glusterfs_graph_t *graph);
int glusterfs_graph_destroy (glusterfs_graph_t *graph);
int glusterfs_get_leaf_count (glusterfs_graph_t *graph);
int glusterfs_graph_activate (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx);
glusterfs_graph_t *glusterfs_graph_construct (FILE *fp);
+int glusterfs_graph_init (glusterfs_graph_t *graph);
glusterfs_graph_t *glusterfs_graph_new (void);
int glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,
glusterfs_graph_t *newgraph);
+int glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path);
void
gf_free_mig_locks (lock_migration_info_t *locks);
diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c
index 04bb92c7c75..b090f8a3554 100644
--- a/libglusterfs/src/graph.c
+++ b/libglusterfs/src/graph.c
@@ -407,13 +407,11 @@ fill_uuid (char *uuid, int size)
int
-glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx)
+glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx,
+ char *volume_name)
{
- const char *volume_name = NULL;
xlator_t *trav = NULL;
- volume_name = ctx->cmd_args.volume_name;
-
if (!volume_name) {
graph->top = graph->first;
return 0;
@@ -454,7 +452,8 @@ glusterfs_graph_parent_up (glusterfs_graph_t *graph)
int
-glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx)
+glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx,
+ char *volume_name)
{
xlator_t *trav = NULL;
int ret = 0;
@@ -462,12 +461,20 @@ glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx)
/* XXX: CHECKSUM */
/* XXX: attach to -n volname */
- ret = glusterfs_graph_settop (graph, ctx);
+ ret = glusterfs_graph_settop (graph, ctx, volume_name);
if (ret) {
+ char *slash = rindex (volume_name, '/');
+ if (slash) {
+ ret = glusterfs_graph_settop (graph, ctx, slash + 1);
+ if (!ret) {
+ goto ok;
+ }
+ }
gf_msg ("graph", GF_LOG_ERROR, 0, LG_MSG_GRAPH_ERROR,
"glusterfs graph settop failed");
return -1;
}
+ok:
/* XXX: WORM VOLUME */
ret = glusterfs_graph_worm (graph, ctx);
@@ -749,7 +756,7 @@ xlator_equal_rec (xlator_t *xl1, xlator_t *xl2)
}
/* type could have changed even if xlator names match,
- e.g cluster/distrubte and cluster/nufa share the same
+ e.g cluster/distribute and cluster/nufa share the same
xlator name
*/
if (strcmp (xl1->type, xl2->type)) {
@@ -764,13 +771,27 @@ out :
gf_boolean_t
is_graph_topology_equal (glusterfs_graph_t *graph1, glusterfs_graph_t *graph2)
{
- xlator_t *trav1 = NULL;
- xlator_t *trav2 = NULL;
- gf_boolean_t ret = _gf_true;
+ xlator_t *trav1 = NULL;
+ xlator_t *trav2 = NULL;
+ gf_boolean_t ret = _gf_true;
+ xlator_list_t *ltrav;
trav1 = graph1->first;
trav2 = graph2->first;
+ if (strcmp (trav2->type, "protocol/server") == 0) {
+ trav2 = trav2->children->xlator;
+ for (ltrav = trav1->children; ltrav; ltrav = ltrav->next) {
+ trav1 = ltrav->xlator;
+ if (strcmp (trav1->name, trav2->name) == 0) {
+ break;
+ }
+ }
+ if (!ltrav) {
+ return _gf_false;
+ }
+ }
+
ret = xlator_equal_rec (trav1, trav2);
if (ret) {
@@ -869,7 +890,8 @@ glusterfs_volfile_reconfigure (int oldvollen, FILE *newvolfile_fp,
goto out;
}
- glusterfs_graph_prepare (newvolfile_graph, ctx);
+ glusterfs_graph_prepare (newvolfile_graph, ctx,
+ ctx->cmd_args.volume_name);
if (!is_graph_topology_equal (oldvolfile_graph,
newvolfile_graph)) {
@@ -917,8 +939,9 @@ int
glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,
glusterfs_graph_t *newgraph)
{
- xlator_t *old_xl = NULL;
- xlator_t *new_xl = NULL;
+ xlator_t *old_xl = NULL;
+ xlator_t *new_xl = NULL;
+ xlator_list_t *trav;
GF_ASSERT (oldgraph);
GF_ASSERT (newgraph);
@@ -933,7 +956,25 @@ glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,
new_xl = new_xl->children->xlator;
}
- return xlator_tree_reconfigure (old_xl, new_xl);
+ if (strcmp (old_xl->type, "protocol/server") != 0) {
+ return xlator_tree_reconfigure (old_xl, new_xl);
+ }
+
+ /* Some options still need to be handled by the server translator. */
+ if (old_xl->reconfigure) {
+ old_xl->reconfigure (old_xl, new_xl->options);
+ }
+
+ (void) copy_opts_to_child (new_xl, FIRST_CHILD (new_xl), "*auth*");
+ new_xl = FIRST_CHILD (new_xl);
+
+ for (trav = old_xl->children; trav; trav = trav->next) {
+ if (strcmp (trav->xlator->name, new_xl->name) == 0) {
+ return xlator_tree_reconfigure (trav->xlator, new_xl);
+ }
+ }
+
+ return -1;
}
int
@@ -987,3 +1028,61 @@ glusterfs_graph_destroy (glusterfs_graph_t *graph)
out:
return ret;
}
+
+
+int
+glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path)
+{
+ xlator_t *this = THIS;
+ FILE *fp;
+ glusterfs_graph_t *graph;
+ xlator_t *xl;
+ char *volfile_id;
+
+ fp = fopen (path, "r");
+ if (!fp) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "oops, %s disappeared on us", path);
+ return -EIO;
+ }
+
+ graph = glusterfs_graph_construct (fp);
+ fclose(fp);
+ if (!graph) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not create graph from %s", path);
+ return -EIO;
+ }
+
+ /*
+ * If there's a server translator on top, we want whatever's below
+ * that.
+ */
+ xl = graph->first;
+ if (strcmp(xl->type, "protocol/server") == 0) {
+ (void) copy_opts_to_child (xl, FIRST_CHILD (xl), "*auth*");
+ xl = FIRST_CHILD(xl);
+ }
+ graph->first = xl;
+
+
+ volfile_id = strstr (path, "/snaps/");
+ if (!volfile_id) {
+ volfile_id = rindex (path, '/');
+ if (volfile_id) {
+ ++volfile_id;
+ }
+ }
+ if (volfile_id) {
+ xl->volfile_id = gf_strdup (volfile_id);
+ /* There's a stray ".vol" at the end. */
+ xl->volfile_id[strlen(xl->volfile_id)-4] = '\0';
+ }
+
+ /* TBD: memory leaks everywhere */
+ glusterfs_graph_prepare (graph, this->ctx, xl->name);
+ glusterfs_graph_init (graph);
+ glusterfs_xlator_link (orig_graph->top, graph->top);
+
+ return 0;
+}
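
A hedged sketch of a caller; the real attach path is the brick process's RPC
handler, which is outside libglusterfs, and volfile_path is illustrative:

        int ret = glusterfs_graph_attach (this->ctx->active, volfile_path);

        if (ret != 0) {
                gf_log (this->name, GF_LOG_ERROR,
                        "failed to attach %s (%d)", volfile_path, ret);
        }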
diff --git a/libglusterfs/src/locking.c b/libglusterfs/src/locking.c
index d3b9754ef76..f27b0d05b35 100644
--- a/libglusterfs/src/locking.c
+++ b/libglusterfs/src/locking.c
@@ -22,7 +22,7 @@ int use_spinlocks = 0;
static void __attribute__((constructor))
gf_lock_setup (void)
{
- use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1);
+ //use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1);
}
#endif
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index 2edebc0aec2..4702ea3eb77 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -406,6 +406,59 @@ out:
return search;
}
+
+/*
+ * With brick multiplexing, we sort of have multiple graphs, so
+ * xlator_search_by_name might not find what we want. Also, the translator
+ * we're looking for might not be a direct child if something else was put in
+ * between (as already happened with decompounder before that was fixed) and
+ * it's hard to debug why our translator wasn't found. Using a recursive tree
+ * search instead of a linear search works around both problems.
+ */
+static xlator_t *
+get_xlator_by_name_or_type (xlator_t *this, char *target, int is_name)
+{
+ xlator_list_t *trav;
+ xlator_t *child_xl;
+ char *value;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ value = is_name ? trav->xlator->name : trav->xlator->type;
+ if (strcmp(value, target) == 0) {
+ return trav->xlator;
+ }
+ child_xl = get_xlator_by_name_or_type (trav->xlator, target,
+ is_name);
+ if (child_xl) {
+ /*
+ * If the xlator we're looking for is somewhere down
+ * the stack, get_xlator_by_name expects to get a
+ * pointer to the top of its subtree (child of "this")
+ * while get_xlator_by_type expects a pointer to what
+ * we actually found. Handle both cases here.
+ *
+ * TBD: rename the functions and fix callers to better
+ * reflect the difference in semantics.
+ */
+ return is_name ? trav->xlator : child_xl;
+ }
+ }
+
+ return NULL;
+}
+
+xlator_t *
+get_xlator_by_name (xlator_t *this, char *target)
+{
+ return get_xlator_by_name_or_type (this, target, 1);
+}
+
+xlator_t *
+get_xlator_by_type (xlator_t *this, char *target)
+{
+ return get_xlator_by_name_or_type (this, target, 0);
+}
+
static int
__xlator_init(xlator_t *xl)
{
@@ -1104,3 +1157,22 @@ xlator_subvolume_count (xlator_t *this)
i++;
return i;
}
+
+static int
+_copy_opt_to_child (dict_t *options, char *key, data_t *value, void *data)
+{
+ xlator_t *child = data;
+
+ gf_log (__func__, GF_LOG_DEBUG,
+ "copying %s to child %s", key, child->name);
+ dict_set (child->options, key, value);
+
+ return 0;
+}
+
+int
+copy_opts_to_child (xlator_t *src, xlator_t *dst, char *glob)
+{
+ return dict_foreach_fnmatch (src->options, glob,
+ _copy_opt_to_child, dst);
+}
diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h
index e28790cc034..1e2698bb61f 100644
--- a/libglusterfs/src/xlator.h
+++ b/libglusterfs/src/xlator.h
@@ -950,6 +950,9 @@ struct _xlator {
/* for the memory pool of 'frame->local' */
struct mem_pool *local_pool;
gf_boolean_t is_autoloaded;
+
+ /* Saved volfile ID (used for multiplexing) */
+ char *volfile_id;
};
typedef struct {
@@ -1004,6 +1007,8 @@ void xlator_foreach_depth_first (xlator_t *this,
void *data);
xlator_t *xlator_search_by_name (xlator_t *any, const char *name);
+xlator_t *get_xlator_by_name (xlator_t *this, char *target);
+xlator_t *get_xlator_by_type (xlator_t *this, char *target);
void
xlator_set_inode_lru_limit (xlator_t *this, void *data);
@@ -1050,5 +1055,7 @@ xlator_subvolume_count (xlator_t *this);
void xlator_init_lock (void);
void xlator_init_unlock (void);
+int
+copy_opts_to_child (xlator_t *src, xlator_t *dst, char *glob);
#endif /* _XLATOR_H */