diff options
author | Mohit Agrawal <moagrawa@redhat.com> | 2018-03-12 19:43:15 +0530 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2018-04-19 04:31:51 +0000 |
commit | 0043c63f70776444f69667a4ef9596217ecb42b7 (patch) | |
tree | e6c239e4b27198d40bca329edcce317ded59de09 /xlators/protocol/server | |
parent | be26b0da2f1a7fe336400de6a1c016716983bd38 (diff) |
gluster: Brick process sometimes crashes when a brick is stopped
Problem: The brick process sometimes crashes when a brick is
stopped while brick mux is enabled.
Solution: The brick process was crashing because rpc connections were
not being cleaned up properly while brick mux is enabled. With this patch,
after sending the GF_EVENT_CLEANUP notification to the xlator (server), the
server waits for all rpc client connections of that specific xlator to be
destroyed. Once the rpc connections of all clients associated with that brick
have been destroyed in server_rpc_notify, xlator_mem_cleanup is called for
the brick xlator as well as all of its child xlators. To avoid races during
cleanup, two new flags are introduced on each xlator: cleanup_starting and
call_cleanup.
BUG: 1544090
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
Note: All test cases were run in a separate build (https://review.gluster.org/#/c/19700/)
with the same patch after forcibly enabling brick mux; all test cases
passed.
Change-Id: Ic4ab9c128df282d146cf1135640281fcb31997bf
updates: bz#1544090
Diffstat (limited to 'xlators/protocol/server')
-rw-r--r-- | xlators/protocol/server/src/server-handshake.c | 25 | ||||
-rw-r--r-- | xlators/protocol/server/src/server.c | 170 | ||||
-rw-r--r-- | xlators/protocol/server/src/server.h | 2 |
3 files changed, 148 insertions, 49 deletions
diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c index de90a6b8eda..08f76de9748 100644 --- a/xlators/protocol/server/src/server-handshake.c +++ b/xlators/protocol/server/src/server-handshake.c @@ -474,6 +474,7 @@ server_setvolume (rpcsvc_request_t *req) struct _child_status *tmp = NULL; char *subdir_mount = NULL; char *client_name = NULL; + gf_boolean_t cleanup_starting = _gf_false; params = dict_new (); reply = dict_new (); @@ -575,11 +576,13 @@ server_setvolume (rpcsvc_request_t *req) "initialised yet. Try again later"); goto fail; } + list_for_each_entry (tmp, &conf->child_status->status_list, status_list) { if (strcmp (tmp->name, name) == 0) break; } + if (!tmp->name) { gf_msg (this->name, GF_LOG_ERROR, 0, PS_MSG_CHILD_STATUS_FAILED, @@ -593,6 +596,7 @@ server_setvolume (rpcsvc_request_t *req) "Failed to set 'child_up' for xlator %s " "in the reply dict", tmp->name); } + ret = dict_get_str (params, "process-uuid", &client_uid); if (ret < 0) { ret = dict_set_str (reply, "ERROR", @@ -634,8 +638,27 @@ server_setvolume (rpcsvc_request_t *req) goto fail; } - if (req->trans->xl_private != client) + pthread_mutex_lock (&conf->mutex); + if (xl->cleanup_starting) { + cleanup_starting = _gf_true; + } else if (req->trans->xl_private != client) { req->trans->xl_private = client; + } + pthread_mutex_unlock (&conf->mutex); + + if (cleanup_starting) { + op_ret = -1; + op_errno = EAGAIN; + + ret = dict_set_str (reply, "ERROR", + "cleanup flag is set for xlator. " + " Try again later"); + if (ret < 0) + gf_msg_debug (this->name, 0, "failed to set error: " + "cleanup flag is set for xlator. 
" + "Try again later"); + goto fail; + } auth_set_username_passwd (params, config_params, client); if (req->trans->ssl_name) { diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index fe1fb71a7ef..03138689b14 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -423,7 +423,16 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, server_conf_t *conf = NULL; client_t *client = NULL; char *auth_path = NULL; - int ret = -1; + int ret = -1; + gf_boolean_t victim_found = _gf_false; + char *xlator_name = NULL; + glusterfs_ctx_t *ctx = NULL; + xlator_t *top = NULL; + xlator_list_t **trav_p = NULL; + xlator_t *travxl = NULL; + uint64_t xprtrefcount = 0; + struct _child_status *tmp = NULL; + if (!xl || !data) { gf_msg_callingfn ("server", GF_LOG_WARNING, 0, @@ -435,6 +444,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, this = xl; trans = data; conf = this->private; + ctx = glusterfsd_ctx; switch (event) { case RPCSVC_EVENT_ACCEPT: @@ -520,9 +530,47 @@ unref_transport: client = trans->xl_private; if (!client) break; + pthread_mutex_lock (&conf->mutex); + list_for_each_entry (tmp, &conf->child_status->status_list, + status_list) { + if (tmp->name && client->bound_xl && + client->bound_xl->cleanup_starting && + !strcmp (tmp->name, client->bound_xl->name)) { + xprtrefcount = GF_ATOMIC_GET (tmp->xprtrefcnt); + if (xprtrefcount > 0) { + xprtrefcount = GF_ATOMIC_DEC (tmp->xprtrefcnt); + if (xprtrefcount == 0) + xlator_name = gf_strdup(client->bound_xl->name); + } + break; + } + } + pthread_mutex_unlock (&conf->mutex); gf_client_unref (client); + if (xlator_name) { + if (this->ctx->active) { + top = this->ctx->active->first; + LOCK (&ctx->volfile_lock); + for (trav_p = &top->children; *trav_p; + trav_p = &(*trav_p)->next) { + travxl = (*trav_p)->xlator; + if (!travxl->call_cleanup && + strcmp (travxl->name, xlator_name) == 0) { + victim_found = _gf_true; + break; + 
} + } + UNLOCK (&ctx->volfile_lock); + if (victim_found) { + xlator_mem_cleanup (travxl); + rpcsvc_autoscale_threads (ctx, conf->rpc, -1); + } + } + GF_FREE (xlator_name); + } + trans->xl_private = NULL; break; default: @@ -966,6 +1014,7 @@ server_init (xlator_t *this) conf->child_status = GF_CALLOC (1, sizeof (struct _child_status), gf_server_mt_child_status); INIT_LIST_HEAD (&conf->child_status->status_list); + GF_ATOMIC_INIT (conf->child_status->xprtrefcnt, 0); /*ret = dict_get_str (this->options, "statedump-path", &statedump_path); if (!ret) { @@ -1331,14 +1380,53 @@ server_process_child_event (xlator_t *this, int32_t event, void *data, int ret = -1; server_conf_t *conf = NULL; rpc_transport_t *xprt = NULL; + xlator_t *victim = NULL; + struct _child_status *tmp = NULL; GF_VALIDATE_OR_GOTO(this->name, data, out); conf = this->private; GF_VALIDATE_OR_GOTO(this->name, conf, out); + victim = data; pthread_mutex_lock (&conf->mutex); { + if (cbk_procnum == GF_CBK_CHILD_UP) { + list_for_each_entry (tmp, &conf->child_status->status_list, + status_list) { + if (tmp->name == NULL) + break; + if (strcmp (tmp->name, victim->name) == 0) + break; + } + if (tmp->name) { + tmp->child_up = _gf_true; + } else { + tmp = GF_CALLOC (1, sizeof (struct _child_status), + gf_server_mt_child_status); + INIT_LIST_HEAD (&tmp->status_list); + tmp->name = gf_strdup (victim->name); + tmp->child_up = _gf_true; + list_add_tail (&tmp->status_list, + &conf->child_status->status_list); + } + } + + if (cbk_procnum == GF_CBK_CHILD_DOWN) { + list_for_each_entry (tmp, &conf->child_status->status_list, + status_list) { + if (strcmp (tmp->name, victim->name) == 0) { + tmp->child_up = _gf_false; + break; + } + } + + if (!tmp->name) + gf_msg (this->name, GF_LOG_ERROR, 0, + PS_MSG_CHILD_STATUS_FAILED, + "No xlator %s is found in " + "child status list", victim->name); + } list_for_each_entry (xprt, &conf->xprt_list, list) { if (!xprt->xl_private) { continue; @@ -1372,6 +1460,8 @@ server_notify (xlator_t 
*this, int32_t event, void *data, ...) struct _child_status *tmp = NULL; gf_boolean_t victim_found = _gf_false; glusterfs_ctx_t *ctx = NULL; + gf_boolean_t xprt_found = _gf_false; + uint64_t totxprt = 0; GF_VALIDATE_OR_GOTO (THIS->name, this, out); conf = this->private; @@ -1406,24 +1496,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) case GF_EVENT_CHILD_UP: { - list_for_each_entry (tmp, &conf->child_status->status_list, - status_list) { - if (tmp->name == NULL) - break; - if (strcmp (tmp->name, victim->name) == 0) - break; - } - if (tmp->name) { - tmp->child_up = _gf_true; - } else { - tmp = GF_CALLOC (1, sizeof (struct _child_status), - gf_server_mt_child_status); - INIT_LIST_HEAD (&tmp->status_list); - tmp->name = gf_strdup (victim->name); - tmp->child_up = _gf_true; - list_add_tail (&tmp->status_list, - &conf->child_status->status_list); - } ret = server_process_child_event (this, event, data, GF_CBK_CHILD_UP); if (ret) { @@ -1438,19 +1510,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) case GF_EVENT_CHILD_DOWN: { - list_for_each_entry (tmp, &conf->child_status->status_list, - status_list) { - if (strcmp (tmp->name, victim->name) == 0) { - tmp->child_up = _gf_false; - break; - } - } - if (!tmp->name) - gf_msg (this->name, GF_LOG_ERROR, 0, - PS_MSG_CHILD_STATUS_FAILED, - "No xlator %s is found in " - "child status list", victim->name); - ret = server_process_child_event (this, event, data, GF_CBK_CHILD_DOWN); if (ret) { @@ -1467,6 +1526,28 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) case GF_EVENT_CLEANUP: conf = this->private; pthread_mutex_lock (&conf->mutex); + /* Calculate total no. 
of xprt available in list for this + brick xlator + */ + list_for_each_entry_safe (xprt, xp_next, + &conf->xprt_list, list) { + if (!xprt->xl_private) { + continue; + } + if (xprt->xl_private->bound_xl == data) { + totxprt++; + } + } + + list_for_each_entry (tmp, &conf->child_status->status_list, + status_list) { + if (strcmp (tmp->name, victim->name) == 0) { + tmp->child_up = _gf_false; + GF_ATOMIC_INIT (tmp->xprtrefcnt, totxprt); + break; + } + } + /* * Disconnecting will (usually) drop the last ref, which will * cause the transport to be unlinked and freed while we're @@ -1482,18 +1563,11 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) gf_log (this->name, GF_LOG_INFO, "disconnecting %s", xprt->peerinfo.identifier); + xprt_found = _gf_true; rpc_transport_disconnect (xprt, _gf_false); } } - list_for_each_entry (tmp, &conf->child_status->status_list, - status_list) { - if (strcmp (tmp->name, victim->name) == 0) - break; - } - if (tmp->name && (strcmp (tmp->name, victim->name) == 0)) { - GF_FREE (tmp->name); - list_del (&tmp->status_list); - } + pthread_mutex_unlock (&conf->mutex); if (this->ctx->active) { top = this->ctx->active->first; @@ -1501,8 +1575,8 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) { travxl = (*trav_p)->xlator; - if (travxl && - strcmp (travxl->name, victim->name) == 0) { + if (!travxl->call_cleanup && + strcmp (travxl->name, victim->name) == 0) { victim_found = _gf_true; break; } @@ -1511,11 +1585,13 @@ server_notify (xlator_t *this, int32_t event, void *data, ...) 
glusterfs_delete_volfile_checksum (ctx, victim->volfile_id); UNLOCK (&ctx->volfile_lock); - if (victim_found) - (*trav_p) = (*trav_p)->next; + rpc_clnt_mgmt_pmap_signout (ctx, victim->name); - /* we need the protocol/server xlator here as 'this' */ - rpcsvc_autoscale_threads (ctx, conf->rpc, -1); + + if (!xprt_found && victim_found) { + xlator_mem_cleanup (victim); + rpcsvc_autoscale_threads (ctx, conf->rpc, -1); + } } break; diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h index 393219bf290..ea1fbf92919 100644 --- a/xlators/protocol/server/src/server.h +++ b/xlators/protocol/server/src/server.h @@ -99,7 +99,7 @@ struct _child_status { struct list_head status_list; char *name; gf_boolean_t child_up; - + gf_atomic_t xprtrefcnt; }; struct server_conf { rpcsvc_t *rpc; |