From 82be66ef8e9e3127d41a4c843daf74c1d8aec4aa Mon Sep 17 00:00:00 2001 From: Atin Mukherjee Date: Thu, 26 Oct 2017 14:26:30 +0530 Subject: glusterd: fix brick restart parallelism glusterd's brick restart logic is not always sequential, as there are at least three different ways in which the bricks are restarted: 1. through friend-sm and glusterd_spawn_daemons () 2. through friend-sm and handling volume quorum action 3. through friend handshaking when there is a mismatch on quorum on friend import. In a brick multiplexing setup, glusterd ended up trying to spawn the same brick process a couple of times because, within a fraction of a millisecond, two threads hit glusterd_brick_start (), and glusterd had no way of rejecting either of them since the brick start criteria were met in both cases. As a solution, it'd be better to control this madness with two different flags: one is a boolean called start_triggered, which indicates that a brick start has been triggered and continues to be true until the brick dies or is killed; the second is a mutex lock to ensure that for a particular brick we don't end up getting into glusterd_brick_start () more than once at the same point in time. 
Change-Id: I292f1e58d6971e111725e1baea1fe98b890b43e2 BUG: 1506513 Signed-off-by: Atin Mukherjee --- xlators/mgmt/glusterd/src/glusterd-handler.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'xlators/mgmt/glusterd/src/glusterd-handler.c') diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index 6e4bfdc420b..80896c2b606 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -5954,16 +5954,22 @@ glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) { int ret = -1; cds_list_for_each_entry (brickinfo, &brick_proc->bricks, brick_list) { - ret = glusterd_get_volinfo_from_brick (brickinfo->path, &volinfo); + ret = glusterd_get_volinfo_from_brick (brickinfo->path, + &volinfo); if (ret) { - gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL, - "Failed to get volinfo from brick(%s)", - brickinfo->path); + gf_msg (THIS->name, GF_LOG_ERROR, 0, + GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo" + " from brick(%s)", brickinfo->path); goto out; } - cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, brick_list) { - if (strcmp (brickinfo->path, brickinfo_tmp->path) == 0) - glusterd_set_brick_status (brickinfo_tmp, GF_BRICK_STOPPED); + cds_list_for_each_entry (brickinfo_tmp, &volinfo->bricks, + brick_list) { + if (strcmp (brickinfo->path, + brickinfo_tmp->path) == 0) { + glusterd_set_brick_status (brickinfo_tmp, + GF_BRICK_STOPPED); + brickinfo_tmp->start_triggered = _gf_false; + } } } return 0; @@ -6137,8 +6143,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, if (temp == 1) break; } - } else + } else { glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED); + brickinfo->start_triggered = _gf_false; + } break; case RPC_CLNT_DESTROY: -- cgit