diff options
| author | Jeff Darcy <jdarcy@redhat.com> | 2016-12-08 16:24:15 -0500 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2017-01-30 19:13:58 -0500 | 
| commit | 1a95fc3036db51b82b6a80952f0908bc2019d24a (patch) | |
| tree | b983ac196a8165d5cb5e860a5ef97d3e9a41b5c9 /libglusterfs | |
| parent | 7f7d7a939e46b330a084d974451eee4757ba61b4 (diff) | |
core: run many bricks within one glusterfsd process
This patch adds support for multiple brick translator stacks running
in a single brick server process.  This reduces our per-brick memory usage by
roughly a factor of three, and our appetite for TCP ports by even more.  It
also creates the potential to avoid process/thread thrashing, and to improve
QoS by scheduling more carefully across the bricks, but realizing that
potential will require further work.
Multiplexing is controlled by the "cluster.brick-multiplex" global option.  By
default it's off, and bricks are started in separate processes as before.  If
multiplexing is enabled, then *compatible* bricks (mostly those with the same
transport options) will be started in the same process.
Change-Id: I45059454e51d6f4cbb29a4953359c09a408695cb
BUG: 1385758
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://review.gluster.org/14763
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'libglusterfs')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | libglusterfs/src/client_t.c | 49 |
| -rw-r--r-- | libglusterfs/src/common-utils.c | 15 |
| -rw-r--r-- | libglusterfs/src/event-epoll.c | 3 |
| -rw-r--r-- | libglusterfs/src/event.h | 16 |
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 5 |
| -rw-r--r-- | libglusterfs/src/graph.c | 127 |
| -rw-r--r-- | libglusterfs/src/locking.c | 2 |
| -rw-r--r-- | libglusterfs/src/xlator.c | 72 |
| -rw-r--r-- | libglusterfs/src/xlator.h | 7 |
9 files changed, 261 insertions, 35 deletions
diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c index b3eb4e4df8c..c20c4089ec3 100644 --- a/libglusterfs/src/client_t.c +++ b/libglusterfs/src/client_t.c @@ -331,11 +331,25 @@ gf_client_ref (client_t *client)  static void +gf_client_destroy_recursive (xlator_t *xl, client_t *client) +{ +        xlator_list_t   *trav; + +        if (xl->cbks->client_destroy) { +                xl->cbks->client_destroy (xl, client); +        } + +        for (trav = xl->children; trav; trav = trav->next) { +                gf_client_destroy_recursive (trav->xlator, client); +        } +} + + +static void  client_destroy (client_t *client)  {          clienttable_t     *clienttable = NULL;          glusterfs_graph_t *gtrav       = NULL; -        xlator_t          *xtrav       = NULL;          if (client == NULL){                  gf_msg_callingfn ("xlator", GF_LOG_ERROR, EINVAL, @@ -358,12 +372,7 @@ client_destroy (client_t *client)          UNLOCK (&clienttable->lock);          list_for_each_entry (gtrav, &client->this->ctx->graphs, list) { -                xtrav = gtrav->top; -                while (xtrav != NULL) { -                        if (xtrav->cbks->client_destroy != NULL) -                                xtrav->cbks->client_destroy (xtrav, client); -                        xtrav = xtrav->next; -                } +                gf_client_destroy_recursive (gtrav->top, client);          }          GF_FREE (client->auth.data);          GF_FREE (client->auth.username); @@ -375,22 +384,32 @@ out:          return;  } +static int +gf_client_disconnect_recursive (xlator_t *xl, client_t *client) +{ +        int             ret     = 0; +        xlator_list_t   *trav; + +        if (xl->cbks->client_disconnect) { +                ret = xl->cbks->client_disconnect (xl, client); +        } + +        for (trav = xl->children; trav; trav = trav->next) { +                ret |= gf_client_disconnect_recursive (trav->xlator, client); +        } + +        return 
ret; +} +  int  gf_client_disconnect (client_t *client)  {          int                ret   = 0;          glusterfs_graph_t *gtrav = NULL; -        xlator_t          *xtrav = NULL;          list_for_each_entry (gtrav, &client->this->ctx->graphs, list) { -                xtrav = gtrav->top; -                while (xtrav != NULL) { -                        if (xtrav->cbks->client_disconnect != NULL) -                                if (xtrav->cbks->client_disconnect (xtrav, client) != 0) -                                        ret = -1; -                        xtrav = xtrav->next; -                } +                ret |= gf_client_disconnect_recursive (gtrav->top, client);          }          return ret; diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 0486409a849..e180dd3eec0 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -3646,15 +3646,17 @@ gf_is_service_running (char *pidfile, int *pid)          int             fno = 0;          file = fopen (pidfile, "r+"); -        if (!file) +        if (!file) {                  goto out; +        }          fno = fileno (file);          ret = lockf (fno, F_TEST, 0);          if (ret == -1)                  running = _gf_true; -        if (!pid) +        if (!pid) {                  goto out; +        }          ret = fscanf (file, "%d", pid);          if (ret <= 0) { @@ -3663,6 +3665,15 @@ gf_is_service_running (char *pidfile, int *pid)                  *pid = -1;          } +        if (!*pid) { +                /* +                 * PID 0 means we've started the process, but it hasn't gotten +                 * far enough to put in a real PID yet.  More details are in +                 * glusterd_brick_start. 
+                 */ +                running = _gf_true; +        } +  out:          if (file)                  fclose (file); diff --git a/libglusterfs/src/event-epoll.c b/libglusterfs/src/event-epoll.c index 3fd580d9d1a..e2b40602e7a 100644 --- a/libglusterfs/src/event-epoll.c +++ b/libglusterfs/src/event-epoll.c @@ -263,6 +263,7 @@ event_pool_new_epoll (int count, int eventthreadcount)          event_pool->count = count;          event_pool->eventthreadcount = eventthreadcount; +        event_pool->auto_thread_count = 0;          pthread_mutex_init (&event_pool->mutex, NULL); @@ -363,7 +364,7 @@ event_register_epoll (struct event_pool *event_pool, int fd,  		   time as well.  		*/ -		slot->events = EPOLLPRI | EPOLLONESHOT; +		slot->events = EPOLLPRI | EPOLLHUP | EPOLLERR | EPOLLONESHOT;  		slot->handler = handler;  		slot->data = data; diff --git a/libglusterfs/src/event.h b/libglusterfs/src/event.h index b01ef24bb8e..1348f5d05c0 100644 --- a/libglusterfs/src/event.h +++ b/libglusterfs/src/event.h @@ -28,7 +28,7 @@ typedef int (*event_handler_t) (int fd, int idx, void *data,  #define EVENT_EPOLL_TABLES 1024  #define EVENT_EPOLL_SLOTS 1024 -#define EVENT_MAX_THREADS  32 +#define EVENT_MAX_THREADS  1024  struct event_pool {  	struct event_ops *ops; @@ -57,6 +57,20 @@ struct event_pool {                                                       * and live status */          int destroy;          int activethreadcount; + +        /* +         * Number of threads created by auto-scaling, *in addition to* the +         * configured number of threads.  This is only applicable on the +         * server, where we try to keep the number of threads around the number +         * of bricks.  In that case, the configured number is just "extra" +         * threads to handle requests in excess of one per brick (including +         * requests on the GlusterD connection).  For clients or GlusterD, this +         * number will always be zero, so the "extra" is all we have. 
+         * +         * TBD: consider auto-scaling for clients as well +         */ +        int auto_thread_count; +  };  struct event_ops { diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 0d073154934..4f1f27b5857 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -557,16 +557,19 @@ typedef struct lock_migration_info {   */  #define SECURE_ACCESS_FILE     GLUSTERD_DEFAULT_WORKDIR "/secure-access" -int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx); +int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, +                             char *volume_name);  int glusterfs_graph_destroy_residual (glusterfs_graph_t *graph);  int glusterfs_graph_deactivate (glusterfs_graph_t *graph);  int glusterfs_graph_destroy (glusterfs_graph_t *graph);  int glusterfs_get_leaf_count (glusterfs_graph_t *graph);  int glusterfs_graph_activate (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx);  glusterfs_graph_t *glusterfs_graph_construct (FILE *fp); +int glusterfs_graph_init (glusterfs_graph_t *graph);  glusterfs_graph_t *glusterfs_graph_new (void);  int glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,                                    glusterfs_graph_t *newgraph); +int glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path);  void  gf_free_mig_locks (lock_migration_info_t *locks); diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c index 04bb92c7c75..b090f8a3554 100644 --- a/libglusterfs/src/graph.c +++ b/libglusterfs/src/graph.c @@ -407,13 +407,11 @@ fill_uuid (char *uuid, int size)  int -glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) +glusterfs_graph_settop (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, +                        char *volume_name)  { -        const char *volume_name = NULL;          xlator_t   *trav = NULL; -        volume_name = ctx->cmd_args.volume_name; -          if (!volume_name) {                  
graph->top = graph->first;                  return 0; @@ -454,7 +452,8 @@ glusterfs_graph_parent_up (glusterfs_graph_t *graph)  int -glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) +glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx, +                         char *volume_name)  {          xlator_t    *trav = NULL;          int          ret = 0; @@ -462,12 +461,20 @@ glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx)          /* XXX: CHECKSUM */          /* XXX: attach to -n volname */ -        ret = glusterfs_graph_settop (graph, ctx); +        ret = glusterfs_graph_settop (graph, ctx, volume_name);          if (ret) { +                char *slash = rindex (volume_name, '/'); +                if (slash) { +                        ret = glusterfs_graph_settop (graph, ctx, slash + 1); +                        if (!ret) { +                                goto ok; +                        } +                }                  gf_msg ("graph", GF_LOG_ERROR, 0, LG_MSG_GRAPH_ERROR,                          "glusterfs graph settop failed");                  return -1;          } +ok:          /* XXX: WORM VOLUME */          ret = glusterfs_graph_worm (graph, ctx); @@ -749,7 +756,7 @@ xlator_equal_rec (xlator_t *xl1, xlator_t *xl2)          }  	/* type could have changed even if xlator names match, -	   e.g cluster/distrubte and cluster/nufa share the same +	   e.g cluster/distribute and cluster/nufa share the same  	   xlator name  	*/          if (strcmp (xl1->type, xl2->type)) { @@ -764,13 +771,27 @@ out :  gf_boolean_t  is_graph_topology_equal (glusterfs_graph_t *graph1, glusterfs_graph_t *graph2)  { -        xlator_t    *trav1    = NULL; -        xlator_t    *trav2    = NULL; -        gf_boolean_t ret      = _gf_true; +        xlator_t      *trav1    = NULL; +        xlator_t      *trav2    = NULL; +        gf_boolean_t   ret      = _gf_true; +        xlator_list_t *ltrav;          trav1 = 
graph1->first;          trav2 = graph2->first; +        if (strcmp (trav2->type, "protocol/server") == 0) { +                trav2 = trav2->children->xlator; +                for (ltrav = trav1->children; ltrav; ltrav = ltrav->next) { +                        trav1 = ltrav->xlator; +                        if (strcmp (trav1->name, trav2->name) == 0) { +                                break; +                        } +                } +                if (!ltrav) { +                        return _gf_false; +                } +        } +          ret = xlator_equal_rec (trav1, trav2);          if (ret) { @@ -869,7 +890,8 @@ glusterfs_volfile_reconfigure (int oldvollen, FILE *newvolfile_fp,                  goto out;          } -	glusterfs_graph_prepare (newvolfile_graph, ctx); +	glusterfs_graph_prepare (newvolfile_graph, ctx, +                                 ctx->cmd_args.volume_name);          if (!is_graph_topology_equal (oldvolfile_graph,                                        newvolfile_graph)) { @@ -917,8 +939,9 @@ int  glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,                               glusterfs_graph_t *newgraph)  { -        xlator_t   *old_xl   = NULL; -        xlator_t   *new_xl   = NULL; +        xlator_t        *old_xl   = NULL; +        xlator_t        *new_xl   = NULL; +        xlator_list_t   *trav;          GF_ASSERT (oldgraph);          GF_ASSERT (newgraph); @@ -933,7 +956,25 @@ glusterfs_graph_reconfigure (glusterfs_graph_t *oldgraph,                  new_xl = new_xl->children->xlator;          } -        return xlator_tree_reconfigure (old_xl, new_xl); +        if (strcmp (old_xl->type, "protocol/server") != 0) { +                return xlator_tree_reconfigure (old_xl, new_xl); +        } + +        /* Some options still need to be handled by the server translator. 
*/ +        if (old_xl->reconfigure) { +                old_xl->reconfigure (old_xl, new_xl->options); +        } + +        (void) copy_opts_to_child (new_xl, FIRST_CHILD (new_xl), "*auth*"); +        new_xl = FIRST_CHILD (new_xl); + +        for (trav = old_xl->children; trav; trav = trav->next) { +                if (strcmp (trav->xlator->name, new_xl->name) == 0) { +                        return xlator_tree_reconfigure (trav->xlator, new_xl); +                } +        } + +        return -1;  }  int @@ -987,3 +1028,61 @@ glusterfs_graph_destroy (glusterfs_graph_t *graph)  out:          return ret;  } + + +int +glusterfs_graph_attach (glusterfs_graph_t *orig_graph, char *path) +{ +        xlator_t                *this   = THIS; +        FILE                    *fp; +        glusterfs_graph_t       *graph; +        xlator_t                *xl; +        char                    *volfile_id; + +        fp = fopen (path, "r"); +        if (!fp) { +                gf_log (THIS->name, GF_LOG_WARNING, +                        "oops, %s disappeared on us", path); +                return -EIO; +        } + +        graph = glusterfs_graph_construct (fp); +        fclose(fp); +        if (!graph) { +                gf_log (this->name, GF_LOG_WARNING, +                        "could not create graph from %s", path); +                return -EIO; +        } + +        /* +         * If there's a server translator on top, we want whatever's below +         * that. 
+         */ +        xl = graph->first; +        if (strcmp(xl->type, "protocol/server") == 0) { +                (void) copy_opts_to_child (xl, FIRST_CHILD (xl), "*auth*"); +                xl = FIRST_CHILD(xl); +        } +        graph->first = xl; + + +        volfile_id = strstr (path, "/snaps/"); +        if (!volfile_id) { +                volfile_id = rindex (path, '/'); +                if (volfile_id) { +                        ++volfile_id; +                } +        } +        if (volfile_id) { +                xl->volfile_id = gf_strdup (volfile_id); +                /* There's a stray ".vol" at the end. */ +                xl->volfile_id[strlen(xl->volfile_id)-4] = '\0'; +        } + +        /* TBD: memory leaks everywhere */ +        glusterfs_graph_prepare (graph, this->ctx, xl->name); +        glusterfs_graph_init (graph); +        glusterfs_xlator_link (orig_graph->top, graph->top); + +        return 0; +} diff --git a/libglusterfs/src/locking.c b/libglusterfs/src/locking.c index d3b9754ef76..f27b0d05b35 100644 --- a/libglusterfs/src/locking.c +++ b/libglusterfs/src/locking.c @@ -22,7 +22,7 @@ int use_spinlocks = 0;  static void __attribute__((constructor))  gf_lock_setup (void)  { -        use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1); +        //use_spinlocks = (sysconf(_SC_NPROCESSORS_ONLN) > 1);  }  #endif diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 2edebc0aec2..4702ea3eb77 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -406,6 +406,59 @@ out:          return search;  } + +/* + * With brick multiplexing, we sort of have multiple graphs, so + * xlator_search_by_name might not find what we want.  Also, the translator + * we're looking for might not be a direct child if something else was put in + * between (as already happened with decompounder before that was fixed) and + * it's hard to debug why our translator wasn't found.  
Using a recursive tree + * search instead of a linear search works around both problems. + */ +static xlator_t * +get_xlator_by_name_or_type (xlator_t *this, char *target, int is_name) +{ +        xlator_list_t   *trav; +        xlator_t        *child_xl; +        char            *value; + +        for (trav = this->children; trav; trav = trav->next) { +                value = is_name ? trav->xlator->name : trav->xlator->type; +                if (strcmp(value, target) == 0) { +                        return trav->xlator; +                } +                child_xl = get_xlator_by_name_or_type (trav->xlator, target, +                                                       is_name); +                if (child_xl) { +                        /* +                         * If the xlator we're looking for is somewhere down +                         * the stack, get_xlator_by_name expects to get a +                         * pointer to the top of its subtree (child of "this") +                         * while get_xlator_by_type expects a pointer to what +                         * we actually found.  Handle both cases here. +                         * +                         * TBD: rename the functions and fix callers to better +                         * reflect the difference in semantics. +                         */ +                        return is_name ? 
trav->xlator : child_xl; +                } +        } + +        return NULL; +} + +xlator_t * +get_xlator_by_name (xlator_t *this, char *target) +{ +        return get_xlator_by_name_or_type (this, target, 1); +} + +xlator_t * +get_xlator_by_type (xlator_t *this, char *target) +{ +        return get_xlator_by_name_or_type (this, target, 0); +} +  static int  __xlator_init(xlator_t *xl)  { @@ -1104,3 +1157,22 @@ xlator_subvolume_count (xlator_t *this)                  i++;          return i;  } + +static int +_copy_opt_to_child (dict_t *options, char *key, data_t *value, void *data) +{ +        xlator_t        *child = data; + +        gf_log (__func__, GF_LOG_DEBUG, +                "copying %s to child %s", key, child->name); +        dict_set (child->options, key, value); + +        return 0; +} + +int +copy_opts_to_child (xlator_t *src, xlator_t *dst, char *glob) +{ +        return dict_foreach_fnmatch (src->options, glob, +                                     _copy_opt_to_child, dst); +} diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index e28790cc034..1e2698bb61f 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -950,6 +950,9 @@ struct _xlator {          /* for the memory pool of 'frame->local' */          struct mem_pool    *local_pool;          gf_boolean_t        is_autoloaded; + +        /* Saved volfile ID (used for multiplexing) */ +        char               *volfile_id;  };  typedef struct { @@ -1004,6 +1007,8 @@ void xlator_foreach_depth_first (xlator_t *this,  				 void *data);  xlator_t *xlator_search_by_name (xlator_t *any, const char *name); +xlator_t *get_xlator_by_name (xlator_t *this, char *target); +xlator_t *get_xlator_by_type (xlator_t *this, char *target);  void  xlator_set_inode_lru_limit (xlator_t *this, void *data); @@ -1050,5 +1055,7 @@ xlator_subvolume_count (xlator_t *this);  void xlator_init_lock (void);  void xlator_init_unlock (void); +int +copy_opts_to_child (xlator_t *src, xlator_t 
*dst, char *glob);  #endif /* _XLATOR_H */  | 
