-rw-r--r--   rpc/rpc-lib/src/rpc-clnt.c   | 27
-rwxr-xr-x   tests/bugs/fb4482137.t       | 62
-rw-r--r--   tests/volume.rc              |  7
3 files changed, 94 insertions, 2 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index fe099f92f60..be18ed9f305 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -122,6 +122,7 @@ call_bail (void *data)
struct iovec iov = {0,};
char peerid[UNIX_PATH_MAX] = {0};
gf_boolean_t need_unref = _gf_false;
+ gf_boolean_t timedout_frames = _gf_false;
GF_VALIDATE_OR_GOTO ("client", data, out);
@@ -198,7 +199,6 @@ call_bail (void *data)
"--",
trav->rpcreq->procnum, trav->rpcreq->xid, frame_sent,
conn->frame_timeout, peerid);
-
clnt = rpc_clnt_ref (clnt);
trav->rpcreq->rpc_status = -1;
trav->rpcreq->cbkfn (trav->rpcreq, &iov, 1, trav->frame);
@@ -207,7 +207,30 @@ call_bail (void *data)
clnt = rpc_clnt_unref (clnt);
list_del_init (&trav->list);
mem_put (trav);
- }
+ timedout_frames = _gf_true;
+ }
+ /* So what on earth is this, you ask? It was observed while testing
+ * the SHD threading code that under high loads SHD/AFR-related
+ * SyncOps & SyncTasks can actually hang/deadlock because the transport
+ * disconnect event never gets bubbled up correctly. Various
+ * tests indicated the ping timeouts worked fine, while "frame timeouts"
+ * did not. The only difference? Ping timeouts actually disconnect
+ * the transport while frame timeouts do not. So at a high level we
+ * know this prevents the deadlock, as subsequent tests showed the deadlocks
+ * no longer occurred (after this change). That said, there may be a
+ * more elegant solution. For now though, forcing a reconnect is
+ * preferable to hanging clients or deadlocking the SHD.
+ *
+ * I suspect the culprit might be in
+ * afr-self-heal-common.c:afr_sh_common_lookup_cbk as this function
+ * early-returns until the callcount actually reaches 0,
+ * which ordinarily is fine (you only want your callback called once
+ * the Nth response is received), but what happens if callcount
+ * never reaches 0? The callback won't be called. This is only a theory
+ * at this point, but a good spot to start when we get a chance.
+ */
+ if (timedout_frames)
+ rpc_transport_disconnect (clnt->conn.trans);
out:
rpc_clnt_unref (clnt);
if (need_unref)
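
For context on the afr_sh_common_lookup_cbk theory in the comment above: the sketch below is a minimal, hypothetical illustration (fake_lookup_cbk and pending_calls are invented names, not the real AFR code) of the call-count pattern it describes. The waiting self-heal step only resumes when the last reply arrives, so a reply that never comes leaves the task stuck, while failing the outstanding frame (which the forced disconnect guarantees) still drives the count to zero.

    /* Minimal, hypothetical sketch of the call-count pattern described
     * above; fake_lookup_cbk and pending_calls are invented names for
     * illustration, not the real afr_sh_common_lookup_cbk. */
    #include <stdio.h>

    static int pending_calls = 3;   /* one outstanding call per replica child */

    /* Invoked once per child reply; only the final reply resumes the
     * waiting self-heal step. */
    static void fake_lookup_cbk (int child, int op_ret)
    {
            printf ("reply from child %d, op_ret=%d\n", child, op_ret);
            pending_calls--;
            if (pending_calls > 0)
                    return;   /* early return: still waiting on other children */

            printf ("all replies in, self-heal resumes\n");
    }

    int main (void)
    {
            fake_lookup_cbk (0, 0);
            fake_lookup_cbk (1, 0);
            /* If this third reply never arrives (a bailed-out frame whose
             * error callback is never delivered), pending_calls never hits
             * 0 and the resume step never runs: the crawl is stuck.  A
             * forced transport disconnect makes RPC fail the outstanding
             * frame, so the callback still fires (with op_ret == -1) and
             * the count drains. */
            fake_lookup_cbk (2, -1);
            return 0;
    }
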
diff --git a/tests/bugs/fb4482137.t b/tests/bugs/fb4482137.t
new file mode 100755
index 00000000000..3616ab6022d
--- /dev/null
+++ b/tests/bugs/fb4482137.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+#
+# Test the scenario where an SHD daemon suffers a frame timeout during a
+# crawl. The expected behavior is that the present crawl will continue
+# after the timeout and not deadlock.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+function wait_for_shd_no_sink() {
+ local TIMEOUT=$1
+ # If we see the "no active sinks" log message, we know
+ # the heal is alive. It cannot proceed because the "sink"
+ # is hung, but it is at least alive and trying.
+ timeout $TIMEOUT grep -q 'replicate-0: no active sinks for' \
+ <(tail -fn0 /var/log/glusterfs/glustershd.log)
+ return $?
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info 2> /dev/null;
+
+# Set up a cluster with 3 replicas
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 network.frame-timeout 2
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.heal-timeout 10
+TEST $CLI volume start $V0
+sleep 5
+
+# Mount the volume
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+# Kill brick 1
+TEST kill_brick $V0 $H0 $B0/${V0}1
+sleep 1
+
+# Write some data into the mount which will require healing
+cd $M0
+for i in {1..1000}; do
+ dd if=/dev/urandom of=testdata_$i bs=64k count=1 2>/dev/null
+done
+
+# Re-start the brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+
+sleep 1
+TEST hang_brick $V0 $H0 $B0/${V0}1
+sleep 4
+TEST wait_for_shd_no_sink 20
+cleanup
diff --git a/tests/volume.rc b/tests/volume.rc
index 5ea75a51d22..f75d8969e94 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -237,6 +237,13 @@ function kill_brick {
kill -9 $(get_brick_pid $vol $host $brick)
}
+function hang_brick {
+ local vol=$1
+ local host=$2
+ local brick=$3
+ kill -STOP $(get_brick_pid $vol $host $brick)
+}
+
function check_option_help_presence {
local option=$1
$CLI volume set help | grep "^Option:" | grep -w $option