cluster/afr: Set AFR UP message as soon as quorum is obtained.

Summary: AFR currently waits for all children to respond before sending an UP message. This means that one dead host can cause us to wait for a TCP timeout (2 mins!) before declaring the volume up. Now we send an UP as soon as quorum is obtained. This is a port of D4701919 to 3.8. Reviewed By: sshreyas Change-Id: I642d4eb7dc7e0b289e89b7a16abf99a3f98aa8b3 Reviewed-on: https://review.gluster.org/18231 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shreyas Siravara <sshreyas@fb.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
author: Shreyas Siravara <sshreyas@fb.com> 2017-09-07 16:27:09 -0700
committer: Shreyas Siravara <sshreyas@fb.com> 2017-09-07 23:55:31 +0000
commit: 9423bc2223227661453a8afd5ab940048abeb008 (patch)
tree: ca7a7924bde87a309e21e03be1837d667d61bfc7
parent: 4a007de8e1bcef9ce059cdca4dcc510846702e1d (diff)
3 files changed, 103 insertions, 17 deletions
diff --git a/tests/basic/afr-early-up.t b/tests/basic/afr-early-up.t
new file mode 100644
index 00000000000..703855f4fac
--- /dev/null
+++ b/tests/basic/afr-early-up.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+function mount_gluster ()
+{
+  local host=$1
+  local volume=$2
+  local mount=$3
+  local timeout=$4
+
+  if ! glusterfs -s $host --volfile-id $volume $mount; then
+    echo "N"
+    return
+  fi
+
+  if ! timeout -s 9 $timeout stat $mount; then
+    echo "N"
+    return
+  fi
+
+  echo "Y"
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3  $H0:$B0/${V0}{1,2,3};
+
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
+EXPECT 'Created' volinfo_field $V0 'Status';
+EXPECT '3' brick_count $V0
+
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+BRICKPORT=$(pgrep -fl glusterfsd | head -n1 | sed 's/^.*listen-port=//')
+
+TEST $CLI volume set $V0 cluster.quorum-type auto
+EXPECT auto volume_option $V0 cluster.quorum-type
+
+# Use iptables to block access to one of the brick ports.
+BRICKPORT=$(pgrep -fla glusterfsd | head -n1 | sed 's/^.*listen-port=//')
+iptables -A INPUT -p tcp --dport $BRICKPORT -j DROP
+ip6tables -A INPUT -p tcp --dport $BRICKPORT -j DROP
+
+# Should still be able to mount within 10 seconds even though brick is
+# unreachable.
+EXPECT "Y" mount_gluster $H0 $V0 $M0 10
+
+# Mount should be writable (we should have quorum)
+TEST dd if=/dev/zero of=$M0/test.out bs=128K count=1 conv=fsync
+
+iptables -D INPUT -p tcp --dport $BRICKPORT -j DROP
+ip6tables -D INPUT -p tcp --dport $BRICKPORT -j DROP
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index b34cc83b635..06860448fc5 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -5113,6 +5113,13 @@ afr_notify (xlator_t *this, int32_t event,
         }
         UNLOCK (&priv->lock);
 
+        if (event == GF_EVENT_CHILD_UP && priv->have_sent_early_up) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "Suppressing initial up message: sent earlier.");
+                propagate = 0;
+        }
+
+
         if (priv->quorum_count) {
                 has_quorum = afr_has_quorum (priv->child_up, this);
                 if (!had_quorum && has_quorum)
@@ -5129,7 +5136,23 @@ afr_notify (xlator_t *this, int32_t event,
         if (have_heard_from_all)
                 propagate = 1;
 
+        /* If we have just obtained quorum, send an UP even if
+         * we have not yet heard from all children. No need to wait
+         * for (possibly offline) children to respond: once quorum
+         * is met, further responses cannot improve on it.
+         */
+        if (!have_heard_from_all &&
+            has_quorum &&
+            event == GF_EVENT_CHILD_UP &&
+            !priv->have_sent_early_up) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "Sending early up message due to meeting quorum.");
+                priv->have_sent_early_up = _gf_true;
+                propagate = 1;
+        }
+
         ret = 0;
+
         if (propagate)
                 ret = default_notify (this, event, data);
 
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b61f6f67460..b60822d0ca9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -128,7 +128,7 @@ typedef struct _afr_private {
         gf_boolean_t metadata_change_log;   /* on/off */
         gf_boolean_t entry_change_log;      /* on/off */
 
-	gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+	      gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
         gf_boolean_t gfid_splitbrain_forced_heal;  /* on/off */
         int read_child;               /* read-subvolume */
         unsigned int hash_mode;       /* for when read_child is not set */
@@ -148,23 +148,23 @@ typedef struct _afr_private {
         gf_boolean_t      optimistic_change_log;
         gf_boolean_t      eager_lock;
         gf_boolean_t      pre_op_compat;      /* on/off */
-	uint32_t          post_op_delay_secs;
+	      uint32_t          post_op_delay_secs;
         unsigned int      quorum_count;
         gf_boolean_t      quorum_reads;
 
         char                   vol_uuid[UUID_SIZE + 1];
         int32_t                *last_event;
 
-	/* @event_generation: Keeps count of number of events received which can
-	   potentially impact consistency decisions. The events are CHILD_UP
-	   and CHILD_DOWN, when we have to recalculate the freshness/staleness
-	   of copies to detect if changes had happened while the other server
-	   was down. CHILD_DOWN and CHILD_UP can also be received on network
-	   disconnect/reconnects and not necessarily server going down/up.
-	   Recalculating freshness/staleness on network events is equally
-	   important as we might have had a network split brain.
-	*/
-	uint32_t               event_generation;
+        /* @event_generation: Keeps count of number of events received which can
+           potentially impact consistency decisions. The events are CHILD_UP
+           and CHILD_DOWN, when we have to recalculate the freshness/staleness
+           of copies to detect if changes had happened while the other server
+           was down. CHILD_DOWN and CHILD_UP can also be received on network
+           disconnect/reconnects and not necessarily server going down/up.
+           Recalculating freshness/staleness on network events is equally
+           important as we might have had a network split brain.
+        */
+	      uint32_t               event_generation;
 
         gf_boolean_t           choose_local;
         gf_boolean_t           did_local_discovery;
@@ -172,8 +172,9 @@ typedef struct _afr_private {
         uint64_t               sh_readdir_size;
         gf_boolean_t           ensure_durability;
         char                   *sh_domain;
-	char                   *afr_dirty;
+	      char                   *afr_dirty;
         gf_boolean_t           halo_enabled;
+        gf_boolean_t           have_sent_early_up;
 
         /* Halo geo-replication tunables */
         gf_boolean_t           halo_failover_enabled;
@@ -191,10 +192,10 @@ typedef struct _afr_private {
         uint64_t               spb_choice_timeout;
         gf_boolean_t           need_heal;
 
-	/* pump dependencies */
-	void                   *pump_private;
-	gf_boolean_t           use_afr_in_pump;
-	char                   *locking_scheme;
+	      /* pump dependencies */
+	      void                   *pump_private;
+	      gf_boolean_t           use_afr_in_pump;
+	      char                   *locking_scheme;
         gf_boolean_t            esh_granular;
 } afr_private_t;
author	Shreyas Siravara <sshreyas@fb.com>	2017-09-07 16:27:09 -0700
committer	Shreyas Siravara <sshreyas@fb.com>	2017-09-07 23:55:31 +0000
commit	9423bc2223227661453a8afd5ab940048abeb008 (patch)
tree	ca7a7924bde87a309e21e03be1837d667d61bfc7
parent	4a007de8e1bcef9ce059cdca4dcc510846702e1d (diff)