summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorMohit Agrawal <moagrawa@redhat.com>2018-03-12 19:43:15 +0530
committerRaghavendra G <rgowdapp@redhat.com>2018-04-19 04:31:51 +0000
commit0043c63f70776444f69667a4ef9596217ecb42b7 (patch)
treee6c239e4b27198d40bca329edcce317ded59de09 /xlators
parentbe26b0da2f1a7fe336400de6a1c016716983bd38 (diff)
gluster: Brick process sometimes crashes while a brick is being stopped
Problem: The brick process sometimes crashes when a brick is stopped while brick mux is enabled. Solution: The brick process crashed because rpc connections were not being cleaned up properly while brick mux is enabled. With this patch, after the GF_EVENT_CLEANUP notification is sent to the server xlator, it waits for all rpc client connections of the specific xlator to be destroyed. Once the rpc connections of all clients associated with that brick have been destroyed in server_rpc_notify, xlator_mem_cleanup is called for the brick xlator as well as all of its child xlators. To avoid races at the time of cleanup, two new flags are introduced in each xlator: cleanup_starting and call_cleanup. BUG: 1544090 Signed-off-by: Mohit Agrawal <moagrawa@redhat.com> Note: All test cases were run in a separate build (https://review.gluster.org/#/c/19700/) with the same patch after forcefully enabling brick mux; all test cases passed. Change-Id: Ic4ab9c128df282d146cf1135640281fcb31997bf updates: bz#1544090
Diffstat (limited to 'xlators')
-rw-r--r--xlators/features/changelog/src/changelog-rpc.c9
-rw-r--r--xlators/features/changelog/src/changelog.c5
-rw-r--r--xlators/features/trash/src/trash.c18
-rw-r--r--xlators/performance/io-threads/src/io-threads.c2
-rw-r--r--xlators/protocol/server/src/server-handshake.c25
-rw-r--r--xlators/protocol/server/src/server.c170
-rw-r--r--xlators/protocol/server/src/server.h2
-rw-r--r--xlators/storage/posix/src/posix-common.c6
-rw-r--r--xlators/storage/posix/src/posix-helpers.c14
9 files changed, 190 insertions, 61 deletions
diff --git a/xlators/features/changelog/src/changelog-rpc.c b/xlators/features/changelog/src/changelog-rpc.c
index 4b2b38cad51..852c0694f9a 100644
--- a/xlators/features/changelog/src/changelog-rpc.c
+++ b/xlators/features/changelog/src/changelog-rpc.c
@@ -258,6 +258,15 @@ changelog_handle_probe (rpcsvc_request_t *req)
changelog_probe_req rpc_req = {0,};
changelog_probe_rsp rpc_rsp = {0,};
+
+ this = req->trans->xl;
+ if (this->cleanup_starting) {
+ gf_msg (this->name, GF_LOG_DEBUG, 0,
+ CHANGELOG_MSG_HANDLE_PROBE_ERROR,
+ "cleanup_starting flag is already set for xl");
+ return 0;
+ }
+
ret = xdr_to_generic (req->msg[0],
&rpc_req, (xdrproc_t)xdr_changelog_probe_req);
if (ret < 0) {
diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c
index 19d66b605bb..12997181da4 100644
--- a/xlators/features/changelog/src/changelog.c
+++ b/xlators/features/changelog/src/changelog.c
@@ -2894,6 +2894,7 @@ void
fini (xlator_t *this)
{
changelog_priv_t *priv = NULL;
+ struct list_head queue = {0, };
priv = this->private;
@@ -2901,6 +2902,10 @@ fini (xlator_t *this)
/* terminate RPC server/threads */
changelog_cleanup_rpc (this, priv);
+ /* call barrier_disable to cancel timer */
+ if (priv->barrier_enabled)
+ __chlog_barrier_disable (this, &queue);
+
/* cleanup barrier related objects */
changelog_barrier_pthread_destroy (priv);
diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c
index e8f8b7bf051..8a92685cf4b 100644
--- a/xlators/features/trash/src/trash.c
+++ b/xlators/features/trash/src/trash.c
@@ -2616,16 +2616,24 @@ fini (xlator_t *this)
GF_VALIDATE_OR_GOTO ("trash", this, out);
priv = this->private;
- inode_table = priv->trash_itable;
if (priv) {
- if (priv->newtrash_dir)
+ inode_table = priv->trash_itable;
+ if (priv->newtrash_dir) {
GF_FREE (priv->newtrash_dir);
- if (priv->oldtrash_dir)
+ priv->newtrash_dir = NULL;
+ }
+ if (priv->oldtrash_dir) {
GF_FREE (priv->oldtrash_dir);
- if (priv->brick_path)
+ priv->oldtrash_dir = NULL;
+ }
+ if (priv->brick_path) {
GF_FREE (priv->brick_path);
- if (priv->eliminate)
+ priv->brick_path = NULL;
+ }
+ if (priv->eliminate) {
wipe_eliminate_path (&priv->eliminate);
+ priv->eliminate = NULL;
+ }
if (inode_table) {
inode_table_destroy (inode_table);
priv->trash_itable = NULL;
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 4531137c936..49a515712f5 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -228,7 +228,7 @@ iot_worker (void *data)
"Dropping poisoned request %p.", stub);
call_stub_destroy (stub);
} else {
- call_resume (stub);
+ call_resume (stub);
}
}
stub = NULL;
diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c
index de90a6b8eda..08f76de9748 100644
--- a/xlators/protocol/server/src/server-handshake.c
+++ b/xlators/protocol/server/src/server-handshake.c
@@ -474,6 +474,7 @@ server_setvolume (rpcsvc_request_t *req)
struct _child_status *tmp = NULL;
char *subdir_mount = NULL;
char *client_name = NULL;
+ gf_boolean_t cleanup_starting = _gf_false;
params = dict_new ();
reply = dict_new ();
@@ -575,11 +576,13 @@ server_setvolume (rpcsvc_request_t *req)
"initialised yet. Try again later");
goto fail;
}
+
list_for_each_entry (tmp, &conf->child_status->status_list,
status_list) {
if (strcmp (tmp->name, name) == 0)
break;
}
+
if (!tmp->name) {
gf_msg (this->name, GF_LOG_ERROR, 0,
PS_MSG_CHILD_STATUS_FAILED,
@@ -593,6 +596,7 @@ server_setvolume (rpcsvc_request_t *req)
"Failed to set 'child_up' for xlator %s "
"in the reply dict", tmp->name);
}
+
ret = dict_get_str (params, "process-uuid", &client_uid);
if (ret < 0) {
ret = dict_set_str (reply, "ERROR",
@@ -634,8 +638,27 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if (req->trans->xl_private != client)
+ pthread_mutex_lock (&conf->mutex);
+ if (xl->cleanup_starting) {
+ cleanup_starting = _gf_true;
+ } else if (req->trans->xl_private != client) {
req->trans->xl_private = client;
+ }
+ pthread_mutex_unlock (&conf->mutex);
+
+ if (cleanup_starting) {
+ op_ret = -1;
+ op_errno = EAGAIN;
+
+ ret = dict_set_str (reply, "ERROR",
+ "cleanup flag is set for xlator. "
+ " Try again later");
+ if (ret < 0)
+ gf_msg_debug (this->name, 0, "failed to set error: "
+ "cleanup flag is set for xlator. "
+ "Try again later");
+ goto fail;
+ }
auth_set_username_passwd (params, config_params, client);
if (req->trans->ssl_name) {
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index fe1fb71a7ef..03138689b14 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -423,7 +423,16 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
server_conf_t *conf = NULL;
client_t *client = NULL;
char *auth_path = NULL;
- int ret = -1;
+ int ret = -1;
+ gf_boolean_t victim_found = _gf_false;
+ char *xlator_name = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ xlator_t *top = NULL;
+ xlator_list_t **trav_p = NULL;
+ xlator_t *travxl = NULL;
+ uint64_t xprtrefcount = 0;
+ struct _child_status *tmp = NULL;
+
if (!xl || !data) {
gf_msg_callingfn ("server", GF_LOG_WARNING, 0,
@@ -435,6 +444,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
this = xl;
trans = data;
conf = this->private;
+ ctx = glusterfsd_ctx;
switch (event) {
case RPCSVC_EVENT_ACCEPT:
@@ -520,9 +530,47 @@ unref_transport:
client = trans->xl_private;
if (!client)
break;
+ pthread_mutex_lock (&conf->mutex);
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (tmp->name && client->bound_xl &&
+ client->bound_xl->cleanup_starting &&
+ !strcmp (tmp->name, client->bound_xl->name)) {
+ xprtrefcount = GF_ATOMIC_GET (tmp->xprtrefcnt);
+ if (xprtrefcount > 0) {
+ xprtrefcount = GF_ATOMIC_DEC (tmp->xprtrefcnt);
+ if (xprtrefcount == 0)
+ xlator_name = gf_strdup(client->bound_xl->name);
+ }
+ break;
+ }
+ }
+ pthread_mutex_unlock (&conf->mutex);
gf_client_unref (client);
+ if (xlator_name) {
+ if (this->ctx->active) {
+ top = this->ctx->active->first;
+ LOCK (&ctx->volfile_lock);
+ for (trav_p = &top->children; *trav_p;
+ trav_p = &(*trav_p)->next) {
+ travxl = (*trav_p)->xlator;
+ if (!travxl->call_cleanup &&
+ strcmp (travxl->name, xlator_name) == 0) {
+ victim_found = _gf_true;
+ break;
+ }
+ }
+ UNLOCK (&ctx->volfile_lock);
+ if (victim_found) {
+ xlator_mem_cleanup (travxl);
+ rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+ }
+ }
+ GF_FREE (xlator_name);
+ }
+
trans->xl_private = NULL;
break;
default:
@@ -966,6 +1014,7 @@ server_init (xlator_t *this)
conf->child_status = GF_CALLOC (1, sizeof (struct _child_status),
gf_server_mt_child_status);
INIT_LIST_HEAD (&conf->child_status->status_list);
+ GF_ATOMIC_INIT (conf->child_status->xprtrefcnt, 0);
/*ret = dict_get_str (this->options, "statedump-path", &statedump_path);
if (!ret) {
@@ -1331,14 +1380,53 @@ server_process_child_event (xlator_t *this, int32_t event, void *data,
int ret = -1;
server_conf_t *conf = NULL;
rpc_transport_t *xprt = NULL;
+ xlator_t *victim = NULL;
+ struct _child_status *tmp = NULL;
GF_VALIDATE_OR_GOTO(this->name, data, out);
conf = this->private;
GF_VALIDATE_OR_GOTO(this->name, conf, out);
+ victim = data;
pthread_mutex_lock (&conf->mutex);
{
+ if (cbk_procnum == GF_CBK_CHILD_UP) {
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (tmp->name == NULL)
+ break;
+ if (strcmp (tmp->name, victim->name) == 0)
+ break;
+ }
+ if (tmp->name) {
+ tmp->child_up = _gf_true;
+ } else {
+ tmp = GF_CALLOC (1, sizeof (struct _child_status),
+ gf_server_mt_child_status);
+ INIT_LIST_HEAD (&tmp->status_list);
+ tmp->name = gf_strdup (victim->name);
+ tmp->child_up = _gf_true;
+ list_add_tail (&tmp->status_list,
+ &conf->child_status->status_list);
+ }
+ }
+
+ if (cbk_procnum == GF_CBK_CHILD_DOWN) {
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (strcmp (tmp->name, victim->name) == 0) {
+ tmp->child_up = _gf_false;
+ break;
+ }
+ }
+
+ if (!tmp->name)
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ PS_MSG_CHILD_STATUS_FAILED,
+ "No xlator %s is found in "
+ "child status list", victim->name);
+ }
list_for_each_entry (xprt, &conf->xprt_list, list) {
if (!xprt->xl_private) {
continue;
@@ -1372,6 +1460,8 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
struct _child_status *tmp = NULL;
gf_boolean_t victim_found = _gf_false;
glusterfs_ctx_t *ctx = NULL;
+ gf_boolean_t xprt_found = _gf_false;
+ uint64_t totxprt = 0;
GF_VALIDATE_OR_GOTO (THIS->name, this, out);
conf = this->private;
@@ -1406,24 +1496,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CHILD_UP:
{
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (tmp->name == NULL)
- break;
- if (strcmp (tmp->name, victim->name) == 0)
- break;
- }
- if (tmp->name) {
- tmp->child_up = _gf_true;
- } else {
- tmp = GF_CALLOC (1, sizeof (struct _child_status),
- gf_server_mt_child_status);
- INIT_LIST_HEAD (&tmp->status_list);
- tmp->name = gf_strdup (victim->name);
- tmp->child_up = _gf_true;
- list_add_tail (&tmp->status_list,
- &conf->child_status->status_list);
- }
ret = server_process_child_event (this, event, data,
GF_CBK_CHILD_UP);
if (ret) {
@@ -1438,19 +1510,6 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CHILD_DOWN:
{
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (strcmp (tmp->name, victim->name) == 0) {
- tmp->child_up = _gf_false;
- break;
- }
- }
- if (!tmp->name)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- PS_MSG_CHILD_STATUS_FAILED,
- "No xlator %s is found in "
- "child status list", victim->name);
-
ret = server_process_child_event (this, event, data,
GF_CBK_CHILD_DOWN);
if (ret) {
@@ -1467,6 +1526,28 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_CLEANUP:
conf = this->private;
pthread_mutex_lock (&conf->mutex);
+ /* Calculate total no. of xprt available in list for this
+ brick xlator
+ */
+ list_for_each_entry_safe (xprt, xp_next,
+ &conf->xprt_list, list) {
+ if (!xprt->xl_private) {
+ continue;
+ }
+ if (xprt->xl_private->bound_xl == data) {
+ totxprt++;
+ }
+ }
+
+ list_for_each_entry (tmp, &conf->child_status->status_list,
+ status_list) {
+ if (strcmp (tmp->name, victim->name) == 0) {
+ tmp->child_up = _gf_false;
+ GF_ATOMIC_INIT (tmp->xprtrefcnt, totxprt);
+ break;
+ }
+ }
+
/*
* Disconnecting will (usually) drop the last ref, which will
* cause the transport to be unlinked and freed while we're
@@ -1482,18 +1563,11 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
gf_log (this->name, GF_LOG_INFO,
"disconnecting %s",
xprt->peerinfo.identifier);
+ xprt_found = _gf_true;
rpc_transport_disconnect (xprt, _gf_false);
}
}
- list_for_each_entry (tmp, &conf->child_status->status_list,
- status_list) {
- if (strcmp (tmp->name, victim->name) == 0)
- break;
- }
- if (tmp->name && (strcmp (tmp->name, victim->name) == 0)) {
- GF_FREE (tmp->name);
- list_del (&tmp->status_list);
- }
+
pthread_mutex_unlock (&conf->mutex);
if (this->ctx->active) {
top = this->ctx->active->first;
@@ -1501,8 +1575,8 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
for (trav_p = &top->children; *trav_p;
trav_p = &(*trav_p)->next) {
travxl = (*trav_p)->xlator;
- if (travxl &&
- strcmp (travxl->name, victim->name) == 0) {
+ if (!travxl->call_cleanup &&
+ strcmp (travxl->name, victim->name) == 0) {
victim_found = _gf_true;
break;
}
@@ -1511,11 +1585,13 @@ server_notify (xlator_t *this, int32_t event, void *data, ...)
glusterfs_delete_volfile_checksum (ctx,
victim->volfile_id);
UNLOCK (&ctx->volfile_lock);
- if (victim_found)
- (*trav_p) = (*trav_p)->next;
+
rpc_clnt_mgmt_pmap_signout (ctx, victim->name);
- /* we need the protocol/server xlator here as 'this' */
- rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+
+ if (!xprt_found && victim_found) {
+ xlator_mem_cleanup (victim);
+ rpcsvc_autoscale_threads (ctx, conf->rpc, -1);
+ }
}
break;
diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h
index 393219bf290..ea1fbf92919 100644
--- a/xlators/protocol/server/src/server.h
+++ b/xlators/protocol/server/src/server.h
@@ -99,7 +99,7 @@ struct _child_status {
struct list_head status_list;
char *name;
gf_boolean_t child_up;
-
+ gf_atomic_t xprtrefcnt;
};
struct server_conf {
rpcsvc_t *rpc;
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
index 507bfc20991..bcaad2703e9 100644
--- a/xlators/storage/posix/src/posix-common.c
+++ b/xlators/storage/posix/src/posix-common.c
@@ -1105,12 +1105,13 @@ posix_fini (xlator_t *this)
struct posix_private *priv = this->private;
if (!priv)
return;
- this->private = NULL;
- if (priv->health_check) {
+ LOCK (&priv->lock);
+ if (priv->health_check_active) {
priv->health_check_active = _gf_false;
pthread_cancel (priv->health_check);
priv->health_check = 0;
}
+ UNLOCK (&priv->lock);
if (priv->disk_space_check) {
priv->disk_space_check_active = _gf_false;
pthread_cancel (priv->disk_space_check);
@@ -1135,6 +1136,7 @@ posix_fini (xlator_t *this)
GF_FREE (priv->hostname);
GF_FREE (priv->trash_path);
GF_FREE (priv);
+ this->private = NULL;
return;
}
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 0ff94df944e..e9d379fda07 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -2001,6 +2001,12 @@ out:
return NULL;
abort:
+ LOCK (&priv->lock);
+ {
+ priv->health_check_active = _gf_false;
+ }
+ UNLOCK (&priv->lock);
+
/* health-check failed */
gf_msg (this->name, GF_LOG_EMERG, 0, P_MSG_HEALTHCHECK_FAILED,
"health-check failed, going down");
@@ -2041,18 +2047,18 @@ abort:
for (trav_p = &top->children; *trav_p;
trav_p = &(*trav_p)->next) {
victim = (*trav_p)->xlator;
- if (victim &&
- strcmp (victim->name, priv->base_path) == 0) {
+ if (!victim->call_cleanup &&
+ strcmp (victim->name, priv->base_path) == 0) {
victim_found = _gf_true;
break;
}
}
UNLOCK (&ctx->volfile_lock);
- if (victim_found) {
+ if (victim_found && !victim->cleanup_starting) {
gf_log (THIS->name, GF_LOG_INFO, "detaching not-only "
" child %s", priv->base_path);
+ victim->cleanup_starting = 1;
top->notify (top, GF_EVENT_CLEANUP, victim);
- xlator_mem_cleanup (victim);
}
}