summaryrefslogtreecommitdiffstats
path: root/xlators/protocol/server
diff options
context:
space:
mode:
authorMohit Agrawal <moagrawa@redhat.com>2018-03-12 19:43:15 +0530
committerRaghavendra G <rgowdapp@redhat.com>2018-04-19 04:31:51 +0000
commit0043c63f70776444f69667a4ef9596217ecb42b7 (patch)
treee6c239e4b27198d40bca329edcce317ded59de09 /xlators/protocol/server
parentbe26b0da2f1a7fe336400de6a1c016716983bd38 (diff)
gluster: Brick process sometimes crashes when a brick is stopped
Problem: The brick process sometimes crashes when a brick is stopped while brick mux is enabled. Solution: The brick process was crashing because rpc connections were not being cleaned up properly while brick mux is enabled. With this patch, after the GF_EVENT_CLEANUP notification is sent to the xlator (server), cleanup waits until all rpc client connections for that specific xlator are destroyed. Once the rpc connections of all clients associated with that brick have been destroyed in server_rpc_notify, xlator_mem_cleanup is called for the brick xlator as well as all of its child xlators. To avoid races during cleanup, two new flags are introduced on each xlator: cleanup_starting and call_cleanup. BUG: 1544090 Signed-off-by: Mohit Agrawal <moagrawa@redhat.com> Note: All test cases were run in a separate build (https://review.gluster.org/#/c/19700/) with the same patch after forcefully enabling brick mux; all test cases passed. Change-Id: Ic4ab9c128df282d146cf1135640281fcb31997bf updates: bz#1544090
Diffstat (limited to 'xlators/protocol/server')
-rw-r--r--xlators/protocol/server/src/server-handshake.c25
-rw-r--r--xlators/protocol/server/src/server.c170
-rw-r--r--xlators/protocol/server/src/server.h2
3 files changed, 148 insertions, 49 deletions
diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c
index de90a6b8eda..08f76de9748 100644
--- a/xlators/protocol/server/src/server-handshake.c
+++ b/xlators/protocol/server/src/server-handshake.c
@@ -474,6 +474,7 @@ server_setvolume (rpcsvc_request_t *req)
struct _child_status *tmp = NULL;
char *subdir_mount = NULL;
char *client_name = NULL;
+ gf_boolean_t cleanup_starting = _gf_false;
params = dict_new ();
reply = dict_new ();
@@ -575,11 +576,13 @@ server_setvolume (rpcsvc_request_t *req)
"initialised yet. Try again later");
goto fail;
}
+
list_for_each_entry (tmp, &conf->child_status->status_list,
status_list) {
if (strcmp (tmp->name, name) == 0)
break;
}
+
if (!tmp->name) {
gf_msg (this->name, GF_LOG_ERROR, 0,
PS_MSG_CHILD_STATUS_FAILED,
@@ -593,6 +596,7 @@ server_setvolume (rpcsvc_request_t *req)
"Failed to set 'child_up' for xlator %s "
"in the reply dict", tmp->name);
}
+
ret = dict_get_str (params, "process-uuid", &client_uid);
if (ret < 0) {
ret = dict_set_str (reply, "ERROR",
@@ -634,8 +638,27 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if (req->trans->xl_private != client)
+ pthread_mutex_lock (&conf->mutex);
+ if (xl->cleanup_starting) {
+ cleanup_starting = _gf_true;
+ } else if (req->trans->xl_private != client) {
req->trans->xl_private = client;
+ }
+ pthread_mutex_unlock (&conf->mutex);
+
+ if (cleanup_starting) {
+ op_ret = -1;
+ op_errno = EAGAIN;
+
+ ret = dict_set_str (reply, "ERROR",
+ "cleanup flag is set for xlator. "
+ " Try again later");
+ if (ret < 0)
+ gf_msg_debug (this->name, 0, "failed to set error: "
+ "cleanup flag is set for xlator. "
+ "Try again later");
+ goto fail;
+ }
auth_set_username_passwd (params, config_params, client);
if (req->trans->ssl_name) {
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index fe1fb71a7ef..03138689b14 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -423,7 +423,16 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
server_conf_t *conf = NULL;
client_t *client = NULL;
char *auth_path = NULL;
- int ret = -1;
+ int ret = -1;
+ gf_boolean_t victim_found = _gf_false;
+ char *xlator_name = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ xlator_t *top = NULL;
+ xlator_list_t **trav_p = NULL;
+ xlator_t *travxl = NULL;
+ uint64_t xprtrefcount = 0;
+ struct _child_status *tmp = NULL;
+
if (!xl || !data) {
gf_msg_callingfn ("server", GF_LOG_WARNING, 0,
@@ -435,6 +444,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
this = xl;
trans = data;
conf = this->private;
+ ctx = glusterfsd_ctx;
switch (event) {
case RPCSVC_EVENT_ACCEPT:
@@ -520,9 +530,47 @@ unref_transport:
client = trans->xl_private;
if (!client)
break;
+ pthread_mutex_lock (&conf->mutex);
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (tmp->name && client->bound_xl &&
+ client->bound_xl->cleanup_starting &&
+ !strcmp (tmp->name, client->bound_xl->name)) {
+ xprtrefcount = GF_ATOMIC_GET (tmp->xprtrefcnt);
+ if (xprtrefcount > 0) {
+ xprtrefcount = GF_ATOMIC_DEC (tmp->xprtrefcnt);
+ if (xprtrefcount == 0)
+ xlator_name = gf_strdup(client->bound_xl->name);
+ }
+ break;
+ }
+ }
+ pthread_mutex_unlock (&conf->mutex);
gf_client_unref (client);
+ if (xlator_name) {
+ if (this->ctx->active) {
+ top = this->ctx->active->first;
+ LOCK (&ctx->volfile_lock);
+ for (trav_p = &top->children; *trav_p;
+ trav_p = &(*trav_p)->next) {
+ travxl = (*trav_p)->xlator;
+ if (!travxl->call_cleanup &&
+ strcmp (travxl->name, xlator_name) == 0) {
+ victim_found = _gf_true;
+ break;
+ }
+ }
+ UNLOCK (&ctx->volfile_lock);
+ if (victim_found) {
+ xlator_mem_cleanup (travxl);
+ rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+ }
+ }
+ GF_FREE (xlator_name);
+ }
+
trans->xl_private = NULL;
break;
default:
@@ -966,6 +1014,7 @@ server_init (xlator_t *this)
conf->child_status = GF_CALLOC (1, sizeof (struct _child_status),
gf_server_mt_child_status);
INIT_LIST_HEAD (&conf->child_status->status_list);
+ GF_ATOMIC_INIT (conf->child_status->xprtrefcnt, 0);
/*ret = dict_get_str (this->options, "statedump-path", &statedump_path);
if (!ret) {
@@ -1331,14 +1380,53 @@ server_process_child_event (xlator_t *this, int32_t event, void *data,
int ret = -1;
server_conf_t *conf = NULL;
rpc_transport_t *xprt = NULL;
+ xlator_t *victim = NULL;
+ struct _child_status *tmp = NULL;
GF_VALIDATE_OR_GOTO(this->name, data, out);
conf = this->private;
GF_VALIDATE_OR_GOTO(this->name, conf, out);
+ victim = data;
pthread_mutex_lock (&conf->mutex);
{
+ if (cbk_procnum == GF_CBK_CHILD_UP) {
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (tmp->name == NULL)
+ break;
+ if (strcmp (tmp->name, victim->name) == 0)
+ break;
+ }
+ if (tmp->name) {
+ tmp->child_up = _gf_true;
+ } else {
+ tmp = GF_CALLOC (1, sizeof (struct _child_status),
+ gf_server_mt_child_status);
+ INIT_LIST_HEAD (&tmp->status_list);
+ tmp->name = gf_strdup (victim->name);
+ tmp->child_up = _gf_true;
+ list_add_tail (&tmp->status_list,
+ &conf->child_status->status_list);
+ }
+ }
+
+ if (cbk_procnum == GF_CBK_CHILD_DOWN) {
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (strcmp (tmp->name, victim->name) == 0) {
+ tmp->child_up = _gf_false;
+ break;
+ }
+ }
+
+ if (!tmp->name)
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ PS_MSG_CHILD_STATUS_FAILED,
+ "No xlator %s is found in "
+ "child status list", victim->name);
+ }
list_for_each_entry (xprt, &conf->xprt_list, list) {
if (!xprt->xl_private) {
continue;
@@ -1372,6 +1460,8 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
struct _child_status *tmp = NULL;
gf_boolean_t victim_found = _gf_false;
glusterfs_ctx_t *ctx = NULL;
+ gf_boolean_t xprt_found = _gf_false;
+ uint64_t totxprt = 0;
GF_VALIDATE_OR_GOTO (THIS->name, this, out);
conf = this->private;
@@ -1406,24 +1496,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CHILD_UP:
{
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (tmp->name == NULL)
- break;
- if (strcmp (tmp->name, victim->name) == 0)
- break;
- }
- if (tmp->name) {
- tmp->child_up = _gf_true;
- } else {
- tmp = GF_CALLOC (1, sizeof (struct _child_status),
- gf_server_mt_child_status);
- INIT_LIST_HEAD (&tmp->status_list);
- tmp->name = gf_strdup (victim->name);
- tmp->child_up = _gf_true;
- list_add_tail (&tmp->status_list,
- &conf->child_status->status_list);
- }
ret = server_process_child_event (this, event, data,
GF_CBK_CHILD_UP);
if (ret) {
@@ -1438,19 +1510,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CHILD_DOWN:
{
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (strcmp (tmp->name, victim->name) == 0) {
- tmp->child_up = _gf_false;
- break;
- }
- }
- if (!tmp->name)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- PS_MSG_CHILD_STATUS_FAILED,
- "No xlator %s is found in "
- "child status list", victim->name);
-
ret = server_process_child_event (this, event, data,
GF_CBK_CHILD_DOWN);
if (ret) {
@@ -1467,6 +1526,28 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CLEANUP:
conf = this->private;
pthread_mutex_lock (&conf->mutex);
+ /* Calculate total no. of xprt available in list for this
+ brick xlator
+ */
+ list_for_each_entry_safe (xprt, xp_next,
+ &conf->xprt_list, list) {
+ if (!xprt->xl_private) {
+ continue;
+ }
+ if (xprt->xl_private->bound_xl == data) {
+ totxprt++;
+ }
+ }
+
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (strcmp (tmp->name, victim->name) == 0) {
+ tmp->child_up = _gf_false;
+ GF_ATOMIC_INIT (tmp->xprtrefcnt, totxprt);
+ break;
+ }
+ }
+
/*
* Disconnecting will (usually) drop the last ref, which will
* cause the transport to be unlinked and freed while we're
@@ -1482,18 +1563,11 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
gf_log (this->name, GF_LOG_INFO,
"disconnecting %s",
xprt->peerinfo.identifier);
+ xprt_found = _gf_true;
rpc_transport_disconnect (xprt, _gf_false);
}
}
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (strcmp (tmp->name, victim->name) == 0)
- break;
- }
- if (tmp->name && (strcmp (tmp->name, victim->name) == 0)) {
- GF_FREE (tmp->name);
- list_del (&tmp->status_list);
- }
+
pthread_mutex_unlock (&conf->mutex);
if (this->ctx->active) {
top = this->ctx->active->first;
@@ -1501,8 +1575,8 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
for (trav_p = &top->children; *trav_p;
trav_p = &(*trav_p)->next) {
travxl = (*trav_p)->xlator;
- if (travxl &&
- strcmp (travxl->name, victim->name) == 0) {
+ if (!travxl->call_cleanup &&
+ strcmp (travxl->name, victim->name) == 0) {
victim_found = _gf_true;
break;
}
@@ -1511,11 +1585,13 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
glusterfs_delete_volfile_checksum (ctx,
victim->volfile_id);
UNLOCK (&ctx->volfile_lock);
- if (victim_found)
- (*trav_p) = (*trav_p)->next;
+
rpc_clnt_mgmt_pmap_signout (ctx, victim->name);
- /* we need the protocol/server xlator here as 'this' */
- rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+
+ if (!xprt_found && victim_found) {
+ xlator_mem_cleanup (victim);
+ rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+ }
}
break;
diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h
index 393219bf290..ea1fbf92919 100644
--- a/xlators/protocol/server/src/server.h
+++ b/xlators/protocol/server/src/server.h
@@ -99,7 +99,7 @@ struct _child_status {
struct list_head status_list;
char *name;
gf_boolean_t child_up;
-
+ gf_atomic_t xprtrefcnt;
};
struct server_conf {
rpcsvc_t *rpc;