author | Mohammed Rafi KC <rkavunga@redhat.com> | 2019-02-25 10:05:32 +0530
committer | Amar Tumballi <amarts@redhat.com> | 2019-04-01 03:44:23 +0000
commit | bc3694d7cfc868a2ed6344ea123faf19fce28d13 (patch)
tree | 51764aa4445462081273444d5ff2499b1e5375f7 /libglusterfs/src/graph.c
parent | 92ae26ae8039847e38c738ef98835a14be9d4296 (diff)
mgmt/shd: Implement multiplexing in self heal daemon
Problem:
The shd daemon is per node, which means it creates a graph
with all volumes on it. While this is great for utilizing
resources, it is not so good in terms of performance and
manageability, because self-heal daemons don't have the
capability to reconfigure their graphs automatically. So each
time any configuration change happens to a volume
(replicate/disperse), we need to restart shd to bring the
change into the graph. Because of this, all ongoing heals for
all other volumes have to be stopped in the middle and
restarted all over again.
Solution:
This change makes shd a per-volume daemon, so that a graph
is generated for each volume. When we want to start/reconfigure
shd for a volume, we first search for an existing shd process
running on the node. If there is none, we start a new process.
If a daemon is already running for shd, we simply detach the
volume's graph and reattach the updated graph for the volume.
This doesn't touch any ongoing operations for any other volume
on the shd daemon.
Example of an shd graph when it is per volume
graph
          -----------------------
          |    debug-iostat     |
          -----------------------
            /        |        \
           /         |         \
     ---------   ---------   ---------
     | AFR-1 |   | AFR-2 |   | AFR-3 |
     ---------   ---------   ---------
A running shd daemon with 3 volumes will look like this:
graph
          -----------------------
          |    debug-iostat     |
          -----------------------
            /        |        \
           /         |         \
   ------------ ------------ ------------
   | volume-1 | | volume-2 | | volume-3 |
   ------------ ------------ ------------
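
To make the multiplexed layout above concrete, here is a minimal,
self-contained C model of how a volume subgraph is hung off the parent
top xlator: a children list for the tree, plus a flat prev/next chain
across the whole graph, which is what the attach path in this patch
maintains. The types and the mux_attach() helper are deliberately
simplified, hypothetical stand-ins for the real xlator_t/xlator_list_t,
not GlusterFS code.

/* Stripped-down model of the mux graph: one top xlator (debug-iostat)
 * with one child subgraph per volume.  Hypothetical sketch only. */
#include <stdio.h>
#include <stdlib.h>

typedef struct xl xl_t;
typedef struct xl_list {
    xl_t *xlator;
    struct xl_list *next;
} xl_list_t;

struct xl {
    const char *name;
    xl_t *next; /* flat chain across the whole multiplexed graph */
    xl_t *prev;
    xl_list_t *children;
};

/* Attach: add the volume's top xlator as one more child of the parent
 * top, and splice it onto the end of the flat chain, mirroring what
 * the attach path below does with glusterfs_xlator_link plus the
 * last_xl->next/prev updates. */
static void
mux_attach(xl_t *top, xl_t **last_xl, xl_t *vol)
{
    xl_list_t *member = calloc(1, sizeof(*member));
    xl_list_t **trav = &top->children;

    if (!member)
        return;
    member->xlator = vol;
    while (*trav) /* walk to the tail of the children list */
        trav = &(*trav)->next;
    *trav = member;

    (*last_xl)->next = vol; /* extend the flat chain */
    vol->prev = *last_xl;
    *last_xl = vol;
}

int
main(void)
{
    xl_t top = {.name = "debug-iostat"};
    xl_t v1 = {.name = "volume-1"}, v2 = {.name = "volume-2"};
    xl_t *last_xl = &top;

    mux_attach(&top, &last_xl, &v1);
    mux_attach(&top, &last_xl, &v2);

    for (xl_list_t *c = top.children; c; c = c->next)
        printf("%s -> %s\n", top.name, c->xlator->name);
    for (xl_t *x = &top; x; x = x->next)
        printf("chain: %s\n", x->name);
    return 0;
}

Detach is the inverse: unlink the child from the children list and
splice its flat-chain neighbours together, which is what
glusterfs_mux_xlator_unlink() in the diff below does.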
Change-Id: Idcb2698be3eeb95beaac47125565c93370afbd99
fixes: bz#1659708
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
Diffstat (limited to 'libglusterfs/src/graph.c')
-rw-r--r-- | libglusterfs/src/graph.c | 451
1 file changed, 451 insertions, 0 deletions
diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c
index bb5e67a5e60..a492dd891fd 100644
--- a/libglusterfs/src/graph.c
+++ b/libglusterfs/src/graph.c
@@ -114,6 +114,53 @@ out:
     return cert_depth;
 }
 
+xlator_t *
+glusterfs_get_last_xlator(glusterfs_graph_t *graph)
+{
+    xlator_t *trav = graph->first;
+    if (!trav)
+        return NULL;
+
+    while (trav->next)
+        trav = trav->next;
+
+    return trav;
+}
+
+xlator_t *
+glusterfs_mux_xlator_unlink(xlator_t *pxl, xlator_t *cxl)
+{
+    xlator_list_t *unlink = NULL;
+    xlator_list_t *prev = NULL;
+    xlator_list_t **tmp = NULL;
+    xlator_t *next_child = NULL;
+    xlator_t *xl = NULL;
+
+    for (tmp = &pxl->children; *tmp; tmp = &(*tmp)->next) {
+        if ((*tmp)->xlator == cxl) {
+            unlink = *tmp;
+            *tmp = (*tmp)->next;
+            if (*tmp)
+                next_child = (*tmp)->xlator;
+            break;
+        }
+        prev = *tmp;
+    }
+
+    if (!prev)
+        xl = pxl;
+    else if (prev->xlator)
+        xl = prev->xlator->graph->last_xl;
+
+    if (xl)
+        xl->next = next_child;
+    if (next_child)
+        next_child->prev = xl;
+
+    GF_FREE(unlink);
+    return next_child;
+}
+
 int
 glusterfs_xlator_link(xlator_t *pxl, xlator_t *cxl)
 {
@@ -1092,6 +1139,8 @@ glusterfs_graph_destroy_residual(glusterfs_graph_t *graph)
     ret = xlator_tree_free_memacct(graph->first);
 
     list_del_init(&graph->list);
+    pthread_mutex_destroy(&graph->mutex);
+    pthread_cond_destroy(&graph->child_down_cond);
     GF_FREE(graph);
 
     return ret;
@@ -1134,6 +1183,25 @@ out:
 }
 
 int
+glusterfs_graph_fini(glusterfs_graph_t *graph)
+{
+    xlator_t *trav = NULL;
+
+    trav = graph->first;
+
+    while (trav) {
+        if (trav->init_succeeded) {
+            trav->cleanup_starting = 1;
+            trav->fini(trav);
+            trav->init_succeeded = 0;
+        }
+        trav = trav->next;
+    }
+
+    return 0;
+}
+
+int
 glusterfs_graph_attach(glusterfs_graph_t *orig_graph, char *path,
                        glusterfs_graph_t **newgraph)
 {
@@ -1256,3 +1324,386 @@ glusterfs_graph_attach(glusterfs_graph_t *orig_graph, char *path,
 
     return 0;
 }
+
+int
+glusterfs_muxsvc_cleanup_parent(glusterfs_ctx_t *ctx,
+                                glusterfs_graph_t *parent_graph)
+{
+    if (parent_graph) {
+        if (parent_graph->first) {
+            xlator_destroy(parent_graph->first);
+        }
+        ctx->active = NULL;
+        GF_FREE(parent_graph);
+        parent_graph = NULL;
+    }
+    return 0;
+}
+
+void *
+glusterfs_graph_cleanup(void *arg)
+{
+    glusterfs_graph_t *graph = NULL;
+    glusterfs_ctx_t *ctx = THIS->ctx;
+    int ret = -1;
+    graph = arg;
+
+    if (!graph)
+        return NULL;
+
+    /* To destroy the graph, first send a GF_EVENT_PARENT_DOWN,
+     * then wait for GF_EVENT_CHILD_DOWN to reach the top
+     * xl. Once we have the GF_EVENT_CHILD_DOWN event, proceed
+     * to fini.
+     *
+     * During the fini call, this will take the last unref on rpc and
+     * rpc_transport_object.
+     */
+    if (graph->first)
+        default_notify(graph->first, GF_EVENT_PARENT_DOWN, graph->first);
+
+    ret = pthread_mutex_lock(&graph->mutex);
+    if (ret != 0) {
+        gf_msg("glusterfs", GF_LOG_ERROR, EAGAIN, LG_MSG_GRAPH_CLEANUP_FAILED,
+               "Failed to acquire a lock");
+        goto out;
+    }
+    /* check and wait for CHILD_DOWN for top xlator */
+    while (graph->used) {
+        ret = pthread_cond_wait(&graph->child_down_cond, &graph->mutex);
+        if (ret != 0)
+            gf_msg("glusterfs", GF_LOG_INFO, 0, LG_MSG_GRAPH_CLEANUP_FAILED,
+                   "cond wait failed ");
+    }
+
+    ret = pthread_mutex_unlock(&graph->mutex);
+    if (ret != 0) {
+        gf_msg("glusterfs", GF_LOG_ERROR, EAGAIN, LG_MSG_GRAPH_CLEANUP_FAILED,
+               "Failed to release a lock");
+    }
+
+    /* Though we got a child down on the top xlator, we have to wait until
+     * all the notifiers exit, because there should not be any threads
+     * that access xl variables.
+     */
+    pthread_mutex_lock(&ctx->notify_lock);
+    {
+        while (ctx->notifying)
+            pthread_cond_wait(&ctx->notify_cond, &ctx->notify_lock);
+    }
+    pthread_mutex_unlock(&ctx->notify_lock);
+
+    glusterfs_graph_fini(graph);
+    glusterfs_graph_destroy(graph);
+out:
+    return NULL;
+}
+
+glusterfs_graph_t *
+glusterfs_muxsvc_setup_parent_graph(glusterfs_ctx_t *ctx, char *name,
+                                    char *type)
+{
+    glusterfs_graph_t *parent_graph = NULL;
+    xlator_t *ixl = NULL;
+    int ret = -1;
+    parent_graph = GF_CALLOC(1, sizeof(*parent_graph),
+                             gf_common_mt_glusterfs_graph_t);
+    if (!parent_graph)
+        goto out;
+
+    INIT_LIST_HEAD(&parent_graph->list);
+
+    ctx->active = parent_graph;
+    ixl = GF_CALLOC(1, sizeof(*ixl), gf_common_mt_xlator_t);
+    if (!ixl)
+        goto out;
+
+    ixl->ctx = ctx;
+    ixl->graph = parent_graph;
+    ixl->options = dict_new();
+    if (!ixl->options)
+        goto out;
+
+    ixl->name = gf_strdup(name);
+    if (!ixl->name)
+        goto out;
+
+    ixl->is_autoloaded = 1;
+
+    if (xlator_set_type(ixl, type) == -1) {
+        gf_msg("glusterfs", GF_LOG_ERROR, EINVAL, LG_MSG_GRAPH_SETUP_FAILED,
+               "%s (%s) set type failed", name, type);
+        goto out;
+    }
+
+    glusterfs_graph_set_first(parent_graph, ixl);
+    parent_graph->top = ixl;
+    ixl = NULL;
+
+    gettimeofday(&parent_graph->dob, NULL);
+    fill_uuid(parent_graph->graph_uuid, 128);
+    parent_graph->id = ctx->graph_id++;
+    ret = 0;
+out:
+    if (ixl)
+        xlator_destroy(ixl);
+
+    if (ret) {
+        glusterfs_muxsvc_cleanup_parent(ctx, parent_graph);
+        parent_graph = NULL;
+    }
+    return parent_graph;
+}
+
+int
+glusterfs_process_svc_detach(glusterfs_ctx_t *ctx, gf_volfile_t *volfile_obj)
+{
+    xlator_t *last_xl = NULL;
+    glusterfs_graph_t *graph = NULL;
+    glusterfs_graph_t *parent_graph = NULL;
+    pthread_t clean_graph = {
+        0,
+    };
+    int ret = -1;
+    xlator_t *xl = NULL;
+
+    if (!ctx || !ctx->active || !volfile_obj)
+        goto out;
+    parent_graph = ctx->active;
+    graph = volfile_obj->graph;
+    if (graph && graph->first)
+        xl = graph->first;
+
+    last_xl = graph->last_xl;
+    if (last_xl)
+        last_xl->next = NULL;
+    if (!xl || xl->cleanup_starting)
+        goto out;
+
+    xl->cleanup_starting = 1;
+    gf_msg("mgmt", GF_LOG_INFO, 0, LG_MSG_GRAPH_DETACH_STARTED,
+           "detaching child %s", volfile_obj->vol_id);
+
+    list_del_init(&volfile_obj->volfile_list);
+    glusterfs_mux_xlator_unlink(parent_graph->top, xl);
+    parent_graph->last_xl = glusterfs_get_last_xlator(parent_graph);
+    parent_graph->xl_count -= graph->xl_count;
+    parent_graph->leaf_count -= graph->leaf_count;
+    default_notify(xl, GF_EVENT_PARENT_DOWN, xl);
+    parent_graph->id++;
+    ret = 0;
+out:
+    if (!ret) {
+        list_del_init(&volfile_obj->volfile_list);
+        if (graph) {
+            ret = gf_thread_create_detached(
+                &clean_graph, glusterfs_graph_cleanup, graph, "graph_clean");
+            if (ret) {
+                gf_msg("glusterfs", GF_LOG_ERROR, EINVAL,
+                       LG_MSG_GRAPH_CLEANUP_FAILED,
+                       "%s failed to create clean "
+                       "up thread",
+                       volfile_obj->vol_id);
+                ret = 0;
+            }
+        }
+        GF_FREE(volfile_obj);
+    }
+    return ret;
+}
+
+int
+glusterfs_process_svc_attach_volfp(glusterfs_ctx_t *ctx, FILE *fp,
+                                   char *volfile_id, char *checksum)
+{
+    glusterfs_graph_t *graph = NULL;
+    glusterfs_graph_t *parent_graph = NULL;
+    glusterfs_graph_t *clean_graph = NULL;
+    int ret = -1;
+    xlator_t *xl = NULL;
+    xlator_t *last_xl = NULL;
+    gf_volfile_t *volfile_obj = NULL;
+    pthread_t thread_id = {
+        0,
+    };
+
+    if (!ctx)
+        goto out;
+    parent_graph = ctx->active;
+    graph = glusterfs_graph_construct(fp);
+    if (!graph) {
+        gf_msg("glusterfsd", GF_LOG_ERROR, EINVAL, LG_MSG_GRAPH_ATTACH_FAILED,
+               "failed to construct the graph");
+        goto out;
+    }
+    graph->last_xl = glusterfs_get_last_xlator(graph);
+
+    for (xl = graph->first; xl; xl = xl->next) {
+        if (strcmp(xl->type, "mount/fuse") == 0) {
+            gf_msg("glusterfsd", GF_LOG_ERROR, EINVAL,
+                   LG_MSG_GRAPH_ATTACH_FAILED,
+                   "fuse xlator cannot be specified in volume file");
+            goto out;
+        }
+    }
+
+    graph->leaf_count = glusterfs_count_leaves(glusterfs_root(graph));
+    xl = graph->first;
+    /* TODO memory leaks everywhere need to free graph in case of error */
+    if (glusterfs_graph_prepare(graph, ctx, xl->name)) {
+        gf_msg("glusterfsd", GF_LOG_WARNING, EINVAL, LG_MSG_GRAPH_ATTACH_FAILED,
+               "failed to prepare graph for xlator %s", xl->name);
+        ret = -1;
+        goto out;
+    } else if (glusterfs_graph_init(graph)) {
+        gf_msg("glusterfsd", GF_LOG_WARNING, EINVAL, LG_MSG_GRAPH_ATTACH_FAILED,
+               "failed to initialize graph for xlator %s", xl->name);
+        ret = -1;
+        goto out;
+    } else if (glusterfs_graph_parent_up(graph)) {
+        gf_msg("glusterfsd", GF_LOG_WARNING, EINVAL, LG_MSG_GRAPH_ATTACH_FAILED,
+               "failed to link the graphs for xlator %s ", xl->name);
+        ret = -1;
+        goto out;
+    }
+
+    if (!parent_graph) {
+        parent_graph = glusterfs_muxsvc_setup_parent_graph(ctx, "glustershd",
+                                                           "debug/io-stats");
+        if (!parent_graph)
+            goto out;
+        ((xlator_t *)parent_graph->top)->next = xl;
+        clean_graph = parent_graph;
+    } else {
+        last_xl = parent_graph->last_xl;
+        if (last_xl)
+            last_xl->next = xl;
+        xl->prev = last_xl;
+    }
+    parent_graph->last_xl = graph->last_xl;
+
+    ret = glusterfs_xlator_link(parent_graph->top, xl);
+    if (ret) {
+        gf_msg("graph", GF_LOG_ERROR, 0, LG_MSG_EVENT_NOTIFY_FAILED,
+               "parent up notification failed");
+        goto out;
+    }
+    parent_graph->xl_count += graph->xl_count;
+    parent_graph->leaf_count += graph->leaf_count;
+    parent_graph->id++;
+
+    if (!volfile_obj) {
+        volfile_obj = GF_CALLOC(1, sizeof(gf_volfile_t), gf_common_volfile_t);
+        if (!volfile_obj) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    graph->used = 1;
+    parent_graph->id++;
+    list_add(&graph->list, &ctx->graphs);
+    INIT_LIST_HEAD(&volfile_obj->volfile_list);
+    volfile_obj->graph = graph;
+    snprintf(volfile_obj->vol_id, sizeof(volfile_obj->vol_id), "%s",
+             volfile_id);
+    memcpy(volfile_obj->volfile_checksum, checksum,
+           sizeof(volfile_obj->volfile_checksum));
+    list_add_tail(&volfile_obj->volfile_list, &ctx->volfile_list);
+
+    gf_log_dump_graph(fp, graph);
+    graph = NULL;
+
+    ret = 0;
+out:
+    if (ret) {
+        if (graph) {
+            gluster_graph_take_reference(graph->first);
+            ret = gf_thread_create_detached(&thread_id, glusterfs_graph_cleanup,
+                                            graph, "graph_clean");
+            if (ret) {
+                gf_msg("glusterfs", GF_LOG_ERROR, EINVAL,
+                       LG_MSG_GRAPH_CLEANUP_FAILED,
+                       "%s failed to create clean "
+                       "up thread",
+                       volfile_id);
+                ret = 0;
+            }
+        }
+        if (clean_graph)
+            glusterfs_muxsvc_cleanup_parent(ctx, clean_graph);
+    }
+    return ret;
+}
+
+int
+glusterfs_mux_volfile_reconfigure(FILE *newvolfile_fp, glusterfs_ctx_t *ctx,
+                                  gf_volfile_t *volfile_obj, char *checksum)
+{
+    glusterfs_graph_t *oldvolfile_graph = NULL;
+    glusterfs_graph_t *newvolfile_graph = NULL;
+
+    int ret = -1;
+
+    if (!ctx) {
+        gf_msg("glusterfsd-mgmt", GF_LOG_ERROR, 0, LG_MSG_CTX_NULL,
+               "ctx is NULL");
+        goto out;
+    }
+
+    /* Change the message id */
+    if (!volfile_obj) {
+        gf_msg("glusterfsd-mgmt", GF_LOG_ERROR, 0, LG_MSG_CTX_NULL,
+               "failed to get volfile object");
+        goto out;
+    }
+
+    oldvolfile_graph = volfile_obj->graph;
+    if (!oldvolfile_graph) {
+        goto out;
+    }
+
+    newvolfile_graph = glusterfs_graph_construct(newvolfile_fp);
+
+    if (!newvolfile_graph) {
+        goto out;
+    }
+    newvolfile_graph->last_xl = glusterfs_get_last_xlator(newvolfile_graph);
+
+    glusterfs_graph_prepare(newvolfile_graph, ctx, newvolfile_graph->first);
+
+    if (!is_graph_topology_equal(oldvolfile_graph, newvolfile_graph)) {
+        ret = glusterfs_process_svc_detach(ctx, volfile_obj);
+        if (ret) {
+            gf_msg("glusterfsd-mgmt", GF_LOG_ERROR, EINVAL,
+                   LG_MSG_GRAPH_CLEANUP_FAILED,
+                   "Could not detach "
+                   "old graph. Aborting the reconfiguration operation");
+            goto out;
+        }
+        ret = glusterfs_process_svc_attach_volfp(ctx, newvolfile_fp,
+                                                 volfile_obj->vol_id,
+                                                 checksum);
+        goto out;
+    }
+
+    gf_msg_debug("glusterfsd-mgmt", 0,
+                 "Only options have changed in the"
+                 " new graph");
+
+    ret = glusterfs_graph_reconfigure(oldvolfile_graph, newvolfile_graph);
+    if (ret) {
+        gf_msg_debug("glusterfsd-mgmt", 0,
+                     "Could not reconfigure "
+                     "new options in old graph");
+        goto out;
+    }
+    memcpy(volfile_obj->volfile_checksum, checksum,
+           sizeof(volfile_obj->volfile_checksum));
+
+    ret = 0;
+out:
+
+    if (newvolfile_graph)
+        glusterfs_graph_destroy(newvolfile_graph);
+
+    return ret;
+}
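
glusterfs_graph_cleanup() above parks a detached thread on
graph->child_down_cond until graph->used drops, i.e. until
GF_EVENT_CHILD_DOWN reaches the top xlator of the detached subgraph.
The signaling side lives outside this file; the sketch below is a
hypothetical, self-contained pthread model of that handshake
(graph_cleanup and on_child_down are invented names), assuming the
notify path clears `used` and signals under the same mutex.

/* Minimal model of the cleanup handshake; not the actual glusterfsd
 * notify code.  Build with: cc model.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct graph {
    pthread_mutex_t mutex;
    pthread_cond_t child_down_cond;
    int used; /* 1 while the graph is attached and serving */
};

static void *
graph_cleanup(void *arg)
{
    struct graph *g = arg;

    pthread_mutex_lock(&g->mutex);
    while (g->used) /* wait for CHILD_DOWN on the top xlator */
        pthread_cond_wait(&g->child_down_cond, &g->mutex);
    pthread_mutex_unlock(&g->mutex);

    printf("child down seen; safe to fini/destroy the graph\n");
    return NULL;
}

/* What the top xlator's CHILD_DOWN notification would do. */
static void
on_child_down(struct graph *g)
{
    pthread_mutex_lock(&g->mutex);
    g->used = 0;
    pthread_cond_signal(&g->child_down_cond);
    pthread_mutex_unlock(&g->mutex);
}

int
main(void)
{
    struct graph g = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1};
    pthread_t t;

    pthread_create(&t, NULL, graph_cleanup, &g);
    sleep(1); /* pretend PARENT_DOWN -> CHILD_DOWN takes a while */
    on_child_down(&g);
    pthread_join(&t, NULL);
    return 0;
}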