From f138d3fa2237e7fa940ecf17153fd700350c4138 Mon Sep 17 00:00:00 2001 From: Mohit Agrawal Date: Tue, 16 Jul 2019 20:36:57 +0530 Subject: posix: In brick_mux brick is crashed while start/stop volume in loop Problem: In brick_mux environment sometime brick is crashed while volume stop/start in a loop.Brick is crashed in janitor task at the time of accessing priv.If posix priv is cleaned up before call janitor task then janitor task is crashed. Solution: To avoid the crash in brick_mux environment introduce a new flag janitor_task_stop in posix_private and before send CHILD_DOWN event wait for update the flag by janitor_task_done Change-Id: Id9fa5d183a463b2b682774ab5cb9868357d139a4 fixes: bz#1730409 Signed-off-by: Mohit Agrawal --- libglusterfs/src/glusterfs/xlator.h | 3 +++ xlators/mgmt/glusterd/src/glusterd-utils.c | 5 ++-- xlators/protocol/server/src/server.c | 6 ++++- xlators/storage/posix/src/posix-common.c | 40 +++++++++++++++++++++++++++++- xlators/storage/posix/src/posix-helpers.c | 16 ++++++++++++ xlators/storage/posix/src/posix.h | 3 +++ 6 files changed, 69 insertions(+), 4 deletions(-) diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h index cdf9d4e8a5d..6449e59f484 100644 --- a/libglusterfs/src/glusterfs/xlator.h +++ b/libglusterfs/src/glusterfs/xlator.h @@ -858,6 +858,9 @@ struct _xlator { /* Flag to notify got CHILD_DOWN event for detach brick */ uint32_t notify_down; + + /* Flag to avoid throw duplicate PARENT_DOWN event */ + uint32_t parent_down; }; /* This would be the only structure which needs to be exported by diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 59b9cbec4a2..d91b672e47e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -4015,8 +4015,9 @@ out: if (msg[0]) { gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s", msg); - gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s", - new_brickinfo->hostname, new_brickinfo->path); + if (new_brickinfo) + gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s", + new_brickinfo->hostname, new_brickinfo->path); } gf_msg_debug("glusterd", 0, "Returning with %d", ret); return ret; diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index 1e91ede0d58..74453edecd7 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -582,6 +582,7 @@ server_graph_janitor_threads(void *data) gf_boolean_t victim_found = _gf_false; xlator_list_t **trav_p = NULL; xlator_t *top = NULL; + uint32_t parent_down = 0; GF_ASSERT(data); @@ -600,7 +601,10 @@ server_graph_janitor_threads(void *data) victim = (*trav_p)->xlator; if (victim->cleanup_starting && strcmp(victim->name, victim_name) == 0) { - victim_found = _gf_true; + parent_down = victim->parent_down; + victim->parent_down = 1; + if (!parent_down) + victim_found = _gf_true; break; } } diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c index 2b39b5bde53..eee39fd000f 100644 --- a/xlators/storage/posix/src/posix-common.c +++ b/xlators/storage/posix/src/posix-common.c @@ -136,10 +136,15 @@ int32_t posix_notify(xlator_t *this, int32_t event, void *data, ...) { xlator_t *victim = data; + struct posix_private *priv = this->private; + int ret = 0; + struct timespec sleep_till = { + 0, + }; switch (event) { case GF_EVENT_PARENT_UP: { - /* Tell the parent that posix xlator is up */ + /* the parent that posix xlator is up */ default_notify(this, GF_EVENT_CHILD_UP, data); } break; @@ -148,6 +153,31 @@ posix_notify(xlator_t *this, int32_t event, void *data, ...) break; gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s", victim->name); + + if (priv->janitor) { + pthread_mutex_lock(&priv->janitor_mutex); + { + priv->janitor_task_stop = _gf_true; + ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, + priv->janitor); + if (!ret) { + clock_gettime(CLOCK_REALTIME, &sleep_till); + sleep_till.tv_sec += 1; + /* Wait to set janitor_task flag to _gf_false by + * janitor_task_done */ + while (priv->janitor_task_stop) { + (void)pthread_cond_timedwait(&priv->janitor_cond, + &priv->janitor_mutex, + &sleep_till); + clock_gettime(CLOCK_REALTIME, &sleep_till); + sleep_till.tv_sec += 1; + } + } + } + pthread_mutex_unlock(&priv->janitor_mutex); + GF_FREE(priv->janitor); + } + priv->janitor = NULL; default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data); } break; default: @@ -997,6 +1027,8 @@ posix_init(xlator_t *this) pthread_mutex_init(&_private->fsync_mutex, NULL); pthread_cond_init(&_private->fsync_cond, NULL); + pthread_mutex_init(&_private->janitor_mutex, NULL); + pthread_cond_init(&_private->janitor_cond, NULL); INIT_LIST_HEAD(&_private->fsyncs); ret = posix_spawn_ctx_janitor_thread(this); if (ret) @@ -1117,6 +1149,7 @@ posix_fini(xlator_t *this) (void)gf_thread_cleanup_xint(priv->disk_space_check); priv->disk_space_check = 0; } + if (priv->janitor) { /*TODO: Make sure the synctask is also complete */ ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor); @@ -1124,8 +1157,10 @@ posix_fini(xlator_t *this) gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED, "Failed to delete janitor timer"); } + GF_FREE(priv->janitor); priv->janitor = NULL; } + if (priv->fsyncer) { (void)gf_thread_cleanup_xint(priv->fsyncer); priv->fsyncer = 0; @@ -1137,6 +1172,9 @@ posix_fini(xlator_t *this) GF_FREE(priv->base_path); LOCK_DESTROY(&priv->lock); pthread_mutex_destroy(&priv->fsync_mutex); + pthread_cond_destroy(&priv->fsync_cond); + pthread_mutex_destroy(&priv->janitor_mutex); + pthread_cond_destroy(&priv->janitor_cond); GF_FREE(priv->hostname); GF_FREE(priv->trash_path); GF_FREE(priv); diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 828aa289a2f..9893058f29e 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1427,12 +1427,24 @@ posix_janitor_task_done(int ret, call_frame_t *frame, void *data) this = data; priv = this->private; + pthread_mutex_lock(&priv->janitor_mutex); + { + if (priv->janitor_task_stop) { + priv->janitor_task_stop = _gf_false; + pthread_cond_signal(&priv->janitor_cond); + pthread_mutex_unlock(&priv->janitor_mutex); + goto out; + } + } + pthread_mutex_unlock(&priv->janitor_mutex); + LOCK(&priv->lock); { __posix_janitor_timer_start(this); } UNLOCK(&priv->lock); +out: return 0; } @@ -1451,6 +1463,9 @@ posix_janitor_task(void *data) old_this = THIS; THIS = this; + if (!priv) + goto out; + time(&now); if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { if (priv->disable_landfill_purge) { @@ -1470,6 +1485,7 @@ posix_janitor_task(void *data) THIS = old_this; +out: return 0; } diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 963daf3f148..9c04c55db02 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -200,6 +200,8 @@ struct posix_private { struct list_head fsyncs; pthread_mutex_t fsync_mutex; pthread_cond_t fsync_cond; + pthread_mutex_t janitor_mutex; + pthread_cond_t janitor_cond; int fsync_queue_count; enum { @@ -254,6 +256,7 @@ struct posix_private { gf_boolean_t fips_mode_rchecksum; gf_boolean_t ctime; + gf_boolean_t janitor_task_stop; }; typedef struct { -- cgit