diff options
author | Samikshan Bairagya <samikshan@gmail.com> | 2017-05-16 15:07:21 +0530 |
---|---|---|
committer | Jeff Darcy <jeff@pl.atyp.us> | 2017-05-18 16:45:28 +0000 |
commit | 13e7b3b354a252ad4065f7b2f0f805c40a3c5d18 (patch) | |
tree | 630353075b33f365297050d514621dbbfd394967 /xlators/mgmt | |
parent | 2d5da5ae6013d17e5121b1e0bbdf021590533d07 (diff) |
glusterd: Don't spawn new glusterfsds on node reboot with brick-mux
With brick multiplexing enabled, upon a node reboot new bricks were
not being attached to the first spawned brick process even though
there weren't any compatibility issues.
The reason for this is that upon glusterd restart after a node
reboot, since brick services aren't running, glusterd starts the
bricks in a "no-wait" mode. So after a brick process is spawned for
the first brick, there isn't enough time for the corresponding pid
file to get populated with a value before the compatibility check is
made for the next brick.
This commit solves this by iteratively waiting for the pidfile to be
populated in the brick compatibility comparison stage before checking
if the brick process is alive.
Change-Id: Ibd1f8e54c63e4bb04162143c9d70f09918a44aa4
BUG: 1451248
Signed-off-by: Samikshan Bairagya <samikshan@gmail.com>
Reviewed-on: https://review.gluster.org/17307
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/mgmt')
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-handler.c | 6 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-pmap.c | 1 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-utils.c | 18 |
3 files changed, 25 insertions, 0 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index 2a5772b6669..f16bc20c01f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -5811,7 +5811,10 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_set_connected (&rpc->conn); gf_msg_debug (this->name, 0, "Connected to %s:%s", brickinfo->hostname, brickinfo->path); + glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED); + brickinfo->started_here = _gf_true; + gf_event (EVENT_BRICK_CONNECTED, "peer=%s;volume=%s;brick=%s", brickinfo->hostname, volinfo->volname, brickinfo->path); @@ -5841,6 +5844,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, GD_MSG_BRICK_DISCONNECTED, "Brick %s:%s has disconnected from glusterd.", brickinfo->hostname, brickinfo->path); + + brickinfo->started_here = _gf_false; + ret = get_volinfo_from_brickid (brickid, &volinfo); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c index c3f5dbc3cd1..2b60b17841b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-pmap.c +++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c @@ -563,6 +563,7 @@ __gluster_pmap_signout (rpcsvc_request_t *req) GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); sys_unlink (pidfile); + brickinfo->started_here = _gf_false; } } diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 47ac842193e..ea8d60cd87b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -2143,6 +2143,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); gf_msg_debug (this->name, 0, "Unlinking pidfile %s", pidfile); (void) sys_unlink (pidfile); + + brickinfo->started_here = _gf_false; out: return ret; } @@ -5170,6 
+5172,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf, glusterd_brickinfo_t *other_brick; char pidfile2[PATH_MAX] = {0}; int32_t pid2 = -1; + int16_t retries = 15; /* * If comp_vol is provided, we have to check *volume* compatibility @@ -5212,8 +5215,22 @@ find_compat_brick_in_vol (glusterd_conf_t *conf, if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) { continue; } + GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol, other_brick, conf); + + /* It is possible that the pidfile hasn't yet been populated, + * when bricks are started in "no-wait" mode; for example + * when bricks are started by glusterd_restart_bricks(). So + * wait for the pidfile to be populated with a value before + * checking if the service is running */ + while (retries > 0) { + if (sys_access (pidfile2, F_OK) == 0) + break; + sleep (1); + retries--; + } + if (!gf_is_service_running (pidfile2, &pid2)) { gf_log (this->name, GF_LOG_INFO, "cleaning up dead brick %s:%s", @@ -5457,6 +5474,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, socketpath, brickinfo->path, volinfo->volname); (void) glusterd_brick_connect (volinfo, brickinfo, socketpath); + brickinfo->started_here = _gf_true; } return 0; } |