diff options
author | Shreyas Siravara <sshreyas@fb.com> | 2017-09-07 16:27:09 -0700 |
---|---|---|
committer | Shreyas Siravara <sshreyas@fb.com> | 2017-09-07 23:55:31 +0000 |
commit | 9423bc2223227661453a8afd5ab940048abeb008 (patch) | |
tree | ca7a7924bde87a309e21e03be1837d667d61bfc7 | |
parent | 4a007de8e1bcef9ce059cdca4dcc510846702e1d (diff) |
cluster/afr: Set AFR UP message as soon as quorum is obtained.
Summary:
AFR currently waits for all children to respond before sending an UP
message. This means that one dead host can cause us to wait for a TCP
timeout (2 mins!) before declaring the volume up.
Now we send an UP as soon as quorum is obtained.
This is a port of D4701919 to 3.8.
Reviewed By: sshreyas
Change-Id: I642d4eb7dc7e0b289e89b7a16abf99a3f98aa8b3
Reviewed-on: https://review.gluster.org/18231
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r-- | tests/basic/afr-early-up.t | 62 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 23 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 35 |
3 files changed, 103 insertions, 17 deletions
diff --git a/tests/basic/afr-early-up.t b/tests/basic/afr-early-up.t new file mode 100644 index 00000000000..703855f4fac --- /dev/null +++ b/tests/basic/afr-early-up.t @@ -0,0 +1,62 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. $(dirname $0)/../dht.rc + +function mount_gluster () +{ + local host=$1 + local volume=$2 + local mount=$3 + local timeout=$4 + + if ! glusterfs -s $host --volfile-id $volume $mount; then + echo "N" + return + fi + + if ! timeout -s 9 $timeout stat $mount; then + echo "N" + return + fi + + echo "Y" +} + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1,2,3}; + +EXPECT "$V0" volinfo_field $V0 'Volume Name'; +EXPECT 'Created' volinfo_field $V0 'Status'; +EXPECT '3' brick_count $V0 + +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +BRICKPORT=$(pgrep -fl glusterfsd | head -n1 | sed 's/^.*listen-port=//') + +TEST $CLI volume set $V0 cluster.quorum-type auto +EXPECT auto volume_option $V0 cluster.quorum-type + +# Use iptables to block access to one of the brick ports. +BRICKPORT=$(pgrep -fla glusterfsd | head -n1 | sed 's/^.*listen-port=//') +iptables -A INPUT -p tcp --dport $BRICKPORT -j DROP +ip6tables -A INPUT -p tcp --dport $BRICKPORT -j DROP + +# Should still be able to mount within 10 seconds even though brick is +# unreachable. 
+EXPECT "Y" mount_gluster $H0 $V0 $M0 10 + +# Mount should be writable (we should have quorum) +TEST dd if=/dev/zero of=$M0/test.out bs=128K count=1 conv=fsync + +iptables -D INPUT -p tcp --dport $BRICKPORT -j DROP +ip6tables -D INPUT -p tcp --dport $BRICKPORT -j DROP + +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index b34cc83b635..06860448fc5 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -5113,6 +5113,13 @@ afr_notify (xlator_t *this, int32_t event, } UNLOCK (&priv->lock); + if (event == GF_EVENT_CHILD_UP && priv->have_sent_early_up) { + gf_log (this->name, GF_LOG_INFO, + "Suppressing initial up message: sent earlier."); + propagate = 0; + } + + if (priv->quorum_count) { has_quorum = afr_has_quorum (priv->child_up, this); if (!had_quorum && has_quorum) @@ -5129,7 +5136,23 @@ afr_notify (xlator_t *this, int32_t event, if (have_heard_from_all) propagate = 1; + /* If we have just obtained quorum, send an UP even if + * we have not yet heard from all children. No need to wait + * for (possible offline) children to respond, we're not + * going to get any more quorumy. 
+ */ + if (!have_heard_from_all && + has_quorum && + event == GF_EVENT_CHILD_UP && + !priv->have_sent_early_up) { + gf_log (this->name, GF_LOG_INFO, + "Sending early up message due to meeting quorum."); + priv->have_sent_early_up = _gf_true; + propagate = 1; + } + ret = 0; + if (propagate) ret = default_notify (this, event, data); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b61f6f67460..b60822d0ca9 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -128,7 +128,7 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ - gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ gf_boolean_t gfid_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not set */ @@ -148,23 +148,23 @@ typedef struct _afr_private { gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; gf_boolean_t pre_op_compat; /* on/off */ - uint32_t post_op_delay_secs; + uint32_t post_op_delay_secs; unsigned int quorum_count; gf_boolean_t quorum_reads; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; - /* @event_generation: Keeps count of number of events received which can - potentially impact consistency decisions. The events are CHILD_UP - and CHILD_DOWN, when we have to recalculate the freshness/staleness - of copies to detect if changes had happened while the other server - was down. CHILD_DOWN and CHILD_UP can also be received on network - disconnect/reconnects and not necessarily server going down/up. - Recalculating freshness/staleness on network events is equally - important as we might have had a network split brain. - */ - uint32_t event_generation; + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. 
The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; gf_boolean_t choose_local; gf_boolean_t did_local_discovery; @@ -172,8 +172,9 @@ typedef struct _afr_private { uint64_t sh_readdir_size; gf_boolean_t ensure_durability; char *sh_domain; - char *afr_dirty; + char *afr_dirty; gf_boolean_t halo_enabled; + gf_boolean_t have_sent_early_up; /* Halo geo-replication tunables */ gf_boolean_t halo_failover_enabled; @@ -191,10 +192,10 @@ typedef struct _afr_private { uint64_t spb_choice_timeout; gf_boolean_t need_heal; - /* pump dependencies */ - void *pump_private; - gf_boolean_t use_afr_in_pump; - char *locking_scheme; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; + char *locking_scheme; gf_boolean_t esh_granular; } afr_private_t; |