diff options
author | Samikshan Bairagya <samikshan@gmail.com> | 2016-06-14 10:52:27 +0530 |
---|---|---|
committer | Jeff Darcy <jdarcy@redhat.com> | 2016-07-05 04:56:58 -0700 |
commit | 807b9a135d697f175fc9933f1d23fb67b0cc6c7d (patch) | |
tree | ee53def468494d1c295e91e28153072d59d0be38 | |
parent | 014e3c67945546a643703223b9d45a90612ecaee (diff) |
glusterd: Don't start bricks if server quorum is not met
Upon glusterd restart, if it is observed that the server quorum
isn't met anymore due to changes to the "server-quorum-ratio"
global option, the bricks should be stopped if they are running.
Also, if glusterd has been restarted and server quorum is not
applicable for a volume, do not restart the bricks corresponding
to the volume to make sure that bricks that have been brought
down purposely, say for maintenance, are not brought up. This
commit moves this check that was previously inside
"glusterd_spawn_daemons" to "glusterd_restart_bricks" instead.
Change-Id: I0a44a2e7cad0739ed7d56d2d67ab58058716de6b
BUG: 1345727
Signed-off-by: Samikshan Bairagya <samikshan@gmail.com>
Reviewed-on: http://review.gluster.org/14758
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
-rw-r--r-- | tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t | 62 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-server-quorum.c | 10 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-utils.c | 70 |
3 files changed, 127 insertions, 15 deletions
diff --git a/tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t b/tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t new file mode 100644 index 00000000000..e5951e0a039 --- /dev/null +++ b/tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t @@ -0,0 +1,62 @@ +#!/bin/bash + +# Test case for quorum validation in glusterd for syncop framework + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../cluster.rc + +cleanup; + +TEST launch_cluster 3 + +TEST $CLI_1 peer probe $H2; +TEST $CLI_1 peer probe $H3; +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count + +# Lets create the volume and set quorum type as a server +TEST $CLI_1 volume create $V0 $H1:$B1/${V0}1 $H2:$B2/${V0}2 $H3:$B3/${V0}3 +TEST $CLI_1 volume set $V0 cluster.server-quorum-type server + +# Start the volume +TEST $CLI_1 volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H1 $B1/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H2 $B2/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H3 $B3/${V0}3 + +# Bring down 2nd and 3rd glusterd +TEST kill_glusterd 2 +TEST kill_glusterd 3 + +# Server quorum is not met. Brick on 1st node must be down +EXPECT "0" brick_up_status_1 $V0 $H1 $B1/${V0}1 + +# Set quorum ratio 95. means 95 % or more than 95% nodes of total available node +# should be available for performing volume operation. +# i.e. Server-side quorum is met if the number of nodes that are available is +# greater than or equal to 'quorum-ratio' times the number of nodes in the +# cluster + +TEST $CLI_1 volume set all cluster.server-quorum-ratio 95 + +# Bring back 2nd glusterd +TEST $glusterd_2 + +EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count + +# Server quorum is still not met. 
Bricks should be down on 1st and 2nd nodes +EXPECT "0" brick_up_status_1 $V0 $H1 $B1/${V0}1 +EXPECT "0" brick_up_status_1 $V0 $H2 $B2/${V0}2 + +# Bring back 3rd glusterd +TEST $glusterd_3 +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count + +# Server quorum is met now. Bricks should be up on all nodes +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H1 $B1/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H2 $B2/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status_1 $V0 $H3 $B3/${V0}3 + +cleanup; + diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c index 7b3f0b79921..ecf9d53b71e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c +++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c @@ -397,9 +397,11 @@ out: return ret; } -/* ret = 1 represents quorum is met or quorum not applicable, - ret = 0 represents quorum is not met -*/ +/* ret = 0 represents quorum is not met + * ret = 1 represents quorum is met + * ret = 2 represents quorum not applicable + */ + int check_quorum_for_brick_start (glusterd_volinfo_t *volinfo, gf_boolean_t node_quorum) @@ -412,7 +414,7 @@ check_quorum_for_brick_start (glusterd_volinfo_t *volinfo, if (node_quorum) ret = 1; } else { - ret = 1; + ret = 2; } return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 9e43908b1a7..627db08972d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -3050,14 +3050,10 @@ int glusterd_spawn_daemons (void *opaque) { glusterd_conf_t *conf = THIS->private; - gf_boolean_t start_bricks = !conf->restart_done; int ret = -1; synclock_lock (&conf->big_lock); - if (start_bricks) { - glusterd_restart_bricks (conf); - conf->restart_done = _gf_true; - } + glusterd_restart_bricks (conf); glusterd_restart_gsyncds (conf); glusterd_restart_rebalance (conf); ret = glusterd_snapdsvc_restart (); @@ 
-4214,6 +4210,24 @@ out: } int +glusterd_get_global_server_quorum_ratio (dict_t *opts, double *quorum) +{ + int ret = -1; + char *quorum_str = NULL; + + ret = dict_get_str (opts, GLUSTERD_QUORUM_RATIO_KEY, &quorum_str); + if (ret) + goto out; + + ret = gf_string2percent (quorum_str, quorum); + if (ret) + goto out; + ret = 0; +out: + return ret; +} + +int glusterd_get_global_opt_version (dict_t *opts, uint32_t *version) { int ret = -1; @@ -4260,6 +4274,8 @@ glusterd_import_global_opts (dict_t *friend_data) int count = 0; uint32_t local_version = 0; uint32_t remote_version = 0; + double old_quorum = 0.0; + double new_quorum = 0.0; this = THIS; conf = this->private; @@ -4283,19 +4299,41 @@ glusterd_import_global_opts (dict_t *friend_data) goto out; } + /* Not handling ret since server-quorum-ratio might not yet be set */ + ret = glusterd_get_global_server_quorum_ratio (conf->opts, + &old_quorum); + ret = glusterd_get_global_server_quorum_ratio (import_options, + &new_quorum); + ret = glusterd_get_global_opt_version (conf->opts, &local_version); if (ret) goto out; ret = glusterd_get_global_opt_version (import_options, &remote_version); if (ret) goto out; + if (remote_version > local_version) { ret = glusterd_store_options (this, import_options); if (ret) goto out; dict_unref (conf->opts); conf->opts = dict_ref (import_options); + + /* If server quorum ratio has changed, restart bricks to + * recompute if quorum is met. 
If quorum is not met bricks are + * not started and those already running are stopped + */ + if (old_quorum != new_quorum) { + ret = glusterd_restart_bricks (conf); + if (ret) { + gf_msg ("glusterd", GF_LOG_INFO, 0, + GD_MSG_SERVER_QUORUM_NOT_MET, + "Restarting bricks failed"); + goto out; + } + } } + ret = 0; out: if (import_options) @@ -4321,7 +4359,6 @@ glusterd_compare_friend_data (dict_t *peer_data, int32_t *status, priv = this->private; GF_ASSERT (priv); - ret = glusterd_import_global_opts (peer_data); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -4883,7 +4920,7 @@ glusterd_restart_bricks (glusterd_conf_t *conf) volinfo->volname); /* Check the quorum, if quorum is not met, don't start the - bricks + bricks. Stop bricks in case they are running. */ ret = check_quorum_for_brick_start (volinfo, node_quorum); if (ret == 0) { @@ -4891,11 +4928,22 @@ glusterd_restart_bricks (glusterd_conf_t *conf) GD_MSG_SERVER_QUORUM_NOT_MET, "Skipping brick " "restart for volume %s as quorum is not met", volinfo->volname); + (void) glusterd_stop_bricks (volinfo); continue; - } - cds_list_for_each_entry (brickinfo, &volinfo->bricks, - brick_list) { - glusterd_brick_start (volinfo, brickinfo, _gf_false); + } else if (ret == 2 && conf->restart_done == _gf_true) { + /* If glusterd has been restarted and quorum is not + * applicable then do not restart the bricks as this + * might start bricks brought down purposely, say for + * maintenance + */ + continue; + } else { + cds_list_for_each_entry (brickinfo, &volinfo->bricks, + brick_list) { + glusterd_brick_start (volinfo, brickinfo, + _gf_false); + } + conf->restart_done = _gf_true; } } |