diff options
author | Atin Mukherjee <amukherj@redhat.com> | 2017-10-26 14:26:30 +0530 |
---|---|---|
committer | Atin Mukherjee <amukherj@redhat.com> | 2017-11-01 03:41:36 +0000 |
commit | 82be66ef8e9e3127d41a4c843daf74c1d8aec4aa (patch) | |
tree | 48a91287a7dd949ce7c9cb52760b337ad8a573dc /xlators/mgmt/glusterd/src/glusterd-utils.c | |
parent | bb7fd73ce4245f54517de1f378a9471f6c8bb454 (diff) |
glusterd: fix brick restart parallelism
glusterd's brick restart logic is not always sequential, as there are
at least three different ways in which the bricks are restarted:
1. through friend-sm and glusterd_spawn_daemons ()
2. through friend-sm and handling volume quorum action
3. through friend handshaking when there is a mismatch on quorum on
friend import.
In a brick multiplexing setup, glusterd ended up trying to spawn the
same brick process a couple of times because, within a fraction of a
millisecond, two threads hit glusterd_brick_start (). glusterd had no
way to reject either of them, since in both cases the brick start
criteria were met.
As a solution, it'd be better to control this madness by two different
mechanisms: one is a boolean flag called start_triggered, which
indicates that a brick start has been triggered and continues to be
true until the brick dies or is killed; the second is a mutex lock, to
ensure that for a particular brick we don't end up entering
glusterd_brick_start () more than once at the same point in time.
Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2
BUG: 1506513
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
Diffstat (limited to 'xlators/mgmt/glusterd/src/glusterd-utils.c')
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-utils.c | 39 |
1 file changed, 30 insertions, 9 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index a91f8dd7138..f211f199ce6 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -1086,7 +1086,7 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo) goto out; CDS_INIT_LIST_HEAD (&new_brickinfo->brick_list); - + pthread_mutex_init (&new_brickinfo->restart_mutex, NULL); *brickinfo = new_brickinfo; ret = 0; @@ -2500,7 +2500,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, (void) sys_unlink (pidfile); brickinfo->status = GF_BRICK_STOPPED; - + brickinfo->start_triggered = _gf_false; if (del_brick) glusterd_delete_brick (volinfo, brickinfo); out: @@ -5837,13 +5837,14 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, * three different triggers for an attempt to start the brick process * due to the quorum handling code in glusterd_friend_sm. */ - if (brickinfo->status == GF_BRICK_STARTING) { + if (brickinfo->status == GF_BRICK_STARTING || + brickinfo->start_triggered) { gf_msg_debug (this->name, 0, "brick %s is already in starting " "phase", brickinfo->path); ret = 0; goto out; } - + brickinfo->start_triggered = _gf_true; GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); if (gf_is_service_running (pidfile, &pid)) { if (brickinfo->status != GF_BRICK_STARTING && @@ -5956,6 +5957,9 @@ run: } out: + if (ret && brickinfo) { + brickinfo->start_triggered = _gf_false; + } gf_msg_debug (this->name, 0, "returning %d ", ret); return ret; } @@ -6017,11 +6021,19 @@ glusterd_restart_bricks (glusterd_conf_t *conf) start_svcs = _gf_true; glusterd_svcs_manager (NULL); } - cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - glusterd_brick_start (volinfo, brickinfo, - _gf_false); + if (!brickinfo->start_triggered) { + pthread_mutex_lock + (&brickinfo->restart_mutex); + { + glusterd_brick_start + (volinfo, brickinfo, + _gf_false); + } + pthread_mutex_unlock + 
(&brickinfo->restart_mutex); + } } ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); @@ -6060,8 +6072,17 @@ glusterd_restart_bricks (glusterd_conf_t *conf) "volume %s", volinfo->volname); cds_list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - glusterd_brick_start (volinfo, brickinfo, - _gf_false); + if (!brickinfo->start_triggered) { + pthread_mutex_lock + (&brickinfo->restart_mutex); + { + glusterd_brick_start + (volinfo, brickinfo, + _gf_false); + } + pthread_mutex_unlock + (&brickinfo->restart_mutex); + } } ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE); |