summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShreyas Siravara <sshreyas@fb.com>2017-09-07 16:27:09 -0700
committerShreyas Siravara <sshreyas@fb.com>2017-09-07 23:55:31 +0000
commit9423bc2223227661453a8afd5ab940048abeb008 (patch)
treeca7a7924bde87a309e21e03be1837d667d61bfc7
parent4a007de8e1bcef9ce059cdca4dcc510846702e1d (diff)
cluster/afr: Set AFR UP message as soon as quorum is obtained.
Summary: AFR currently waits for all children to respond before sending an UP message. This means that one dead host can cause us to wait for a TCP timeout (2 mins!) before declaring the volume up. Now we send an UP as soon as quorum is obtained. This is a port of D4701919 to 3.8. Reviewed By: sshreyas Change-Id: I642d4eb7dc7e0b289e89b7a16abf99a3f98aa8b3 Reviewed-on: https://review.gluster.org/18231 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shreyas Siravara <sshreyas@fb.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r--tests/basic/afr-early-up.t62
-rw-r--r--xlators/cluster/afr/src/afr-common.c23
-rw-r--r--xlators/cluster/afr/src/afr.h35
3 files changed, 103 insertions, 17 deletions
diff --git a/tests/basic/afr-early-up.t b/tests/basic/afr-early-up.t
new file mode 100644
index 00000000000..703855f4fac
--- /dev/null
+++ b/tests/basic/afr-early-up.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+function mount_gluster ()
+{
+ # Mount $volume from $host on $mount, then stat the mount point under a
+ # $timeout-second limit (signal 9). Echoes "Y" if both succeed, else "N".
+ local host=$1
+ local volume=$2
+ local mount=$3
+ local timeout=$4
+
+ if ! glusterfs -s "$host" --volfile-id "$volume" "$mount"; then
+ echo "N"
+ return
+ fi
+ if ! timeout -s 9 "$timeout" stat "$mount"; then
+ echo "N"
+ return
+ fi
+ echo "Y"
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1,2,3};
+
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
+EXPECT 'Created' volinfo_field $V0 'Status';
+EXPECT '3' brick_count $V0
+
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+BRICKPORT=$(pgrep -fl glusterfsd | head -n1 | sed 's/^.*listen-port=//')
+
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT auto volume_option $V0 cluster.quorum-type
+
+# Use iptables to block access to one of the brick ports.
+BRICKPORT=$(pgrep -fla glusterfsd | head -n1 | sed 's/^.*listen-port=//')
+iptables -A INPUT -p tcp --dport $BRICKPORT -j DROP
+ip6tables -A INPUT -p tcp --dport $BRICKPORT -j DROP
+
+# Should still be able to mount within 10 seconds even though brick is
+# unreachable.
+EXPECT "Y" mount_gluster $H0 $V0 $M0 10
+
+# Mount should be writable (we should have quorum)
+TEST dd if=/dev/zero of=$M0/test.out bs=128K count=1 conv=fsync
+
+iptables -D INPUT -p tcp --dport $BRICKPORT -j DROP
+ip6tables -D INPUT -p tcp --dport $BRICKPORT -j DROP
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index b34cc83b635..06860448fc5 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -5113,6 +5113,13 @@ afr_notify (xlator_t *this, int32_t event,
}
UNLOCK (&priv->lock);
+ if (event == GF_EVENT_CHILD_UP && priv->have_sent_early_up) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Suppressing initial up message: sent earlier.");
+ propagate = 0;
+ }
+
+
if (priv->quorum_count) {
has_quorum = afr_has_quorum (priv->child_up, this);
if (!had_quorum && has_quorum)
@@ -5129,7 +5136,23 @@ afr_notify (xlator_t *this, int32_t event,
if (have_heard_from_all)
propagate = 1;
+ /* If we have just obtained quorum, send an UP even if
+ * we have not yet heard from all children. There is no need to
+ * wait for (possibly offline) children to respond: once quorum
+ * is met, further responses cannot make it any more met.
+ */
+ if (!have_heard_from_all &&
+ has_quorum &&
+ event == GF_EVENT_CHILD_UP &&
+ !priv->have_sent_early_up) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Sending early up message due to meeting quorum.");
+ priv->have_sent_early_up = _gf_true;
+ propagate = 1;
+ }
+
ret = 0;
+
if (propagate)
ret = default_notify (this, event, data);
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b61f6f67460..b60822d0ca9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -128,7 +128,7 @@ typedef struct _afr_private {
gf_boolean_t metadata_change_log; /* on/off */
gf_boolean_t entry_change_log; /* on/off */
- gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
gf_boolean_t gfid_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
@@ -148,23 +148,23 @@ typedef struct _afr_private {
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
gf_boolean_t pre_op_compat; /* on/off */
- uint32_t post_op_delay_secs;
+ uint32_t post_op_delay_secs;
unsigned int quorum_count;
gf_boolean_t quorum_reads;
char vol_uuid[UUID_SIZE + 1];
int32_t *last_event;
- /* @event_generation: Keeps count of number of events received which can
- potentially impact consistency decisions. The events are CHILD_UP
- and CHILD_DOWN, when we have to recalculate the freshness/staleness
- of copies to detect if changes had happened while the other server
- was down. CHILD_DOWN and CHILD_UP can also be received on network
- disconnect/reconnects and not necessarily server going down/up.
- Recalculating freshness/staleness on network events is equally
- important as we might have had a network split brain.
- */
- uint32_t event_generation;
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
gf_boolean_t choose_local;
gf_boolean_t did_local_discovery;
@@ -172,8 +172,9 @@ typedef struct _afr_private {
uint64_t sh_readdir_size;
gf_boolean_t ensure_durability;
char *sh_domain;
- char *afr_dirty;
+ char *afr_dirty;
gf_boolean_t halo_enabled;
+ gf_boolean_t have_sent_early_up;
/* Halo geo-replication tunables */
gf_boolean_t halo_failover_enabled;
@@ -191,10 +192,10 @@ typedef struct _afr_private {
uint64_t spb_choice_timeout;
gf_boolean_t need_heal;
- /* pump dependencies */
- void *pump_private;
- gf_boolean_t use_afr_in_pump;
- char *locking_scheme;
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+ char *locking_scheme;
gf_boolean_t esh_granular;
} afr_private_t;