diff options
-rw-r--r-- | libglusterfs/src/defaults-tmpl.c | 7 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs/glusterfs.h | 1 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs/xlator.h | 2 | ||||
-rw-r--r-- | libglusterfs/src/graph.c | 1 | ||||
-rw-r--r-- | libglusterfs/src/libglusterfs.sym | 1 | ||||
-rw-r--r-- | libglusterfs/src/xlator.c | 23 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.c | 8 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.h | 2 | ||||
-rw-r--r-- | tests/basic/graph-cleanup-brick-down-shd-mux.t | 64 | ||||
-rw-r--r-- | tests/basic/volume-scale-shd-mux.t | 7 | ||||
-rw-r--r-- | xlators/protocol/client/src/client.c | 65 |
11 files changed, 169 insertions, 12 deletions
diff --git a/libglusterfs/src/defaults-tmpl.c b/libglusterfs/src/defaults-tmpl.c index 82e7f78d7f3..3cf707f42aa 100644 --- a/libglusterfs/src/defaults-tmpl.c +++ b/libglusterfs/src/defaults-tmpl.c @@ -171,8 +171,11 @@ default_notify(xlator_t *this, int32_t event, void *data, ...) /* Make sure this is not a daemon with master xlator */ pthread_mutex_lock(&graph->mutex); { - graph->used = 0; - pthread_cond_broadcast(&graph->child_down_cond); + if (graph->parent_down == + graph_total_client_xlator(graph)) { + graph->used = 0; + pthread_cond_broadcast(&graph->child_down_cond); + } } pthread_mutex_unlock(&graph->mutex); } diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h index 01262dcd9f5..155bf435386 100644 --- a/libglusterfs/src/glusterfs/glusterfs.h +++ b/libglusterfs/src/glusterfs/glusterfs.h @@ -594,6 +594,7 @@ struct _glusterfs_graph { in client multiplexed code path */ pthread_mutex_t mutex; pthread_cond_t child_down_cond; /* for broadcasting CHILD_DOWN */ + int parent_down; char graph_uuid[128]; }; typedef struct _glusterfs_graph glusterfs_graph_t; diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h index 6449e59f484..6608d6cdf0d 100644 --- a/libglusterfs/src/glusterfs/xlator.h +++ b/libglusterfs/src/glusterfs/xlator.h @@ -1095,4 +1095,6 @@ mgmt_is_multiplexed_daemon(char *name); gf_boolean_t xlator_is_cleanup_starting(xlator_t *this); +int +graph_total_client_xlator(glusterfs_graph_t *graph); #endif /* _XLATOR_H */ diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c index bbc5ad68d94..e6ae40db2ed 100644 --- a/libglusterfs/src/graph.c +++ b/libglusterfs/src/graph.c @@ -1695,6 +1695,7 @@ glusterfs_process_svc_attach_volfp(glusterfs_ctx_t *ctx, FILE *fp, "failed to construct the graph"); goto out; } + graph->parent_down = 0; graph->last_xl = glusterfs_get_last_xlator(graph); for (xl = graph->first; xl; xl = xl->next) { diff --git a/libglusterfs/src/libglusterfs.sym 
b/libglusterfs/src/libglusterfs.sym index 2e83d3f1003..dc7382ba749 100644 --- a/libglusterfs/src/libglusterfs.sym +++ b/libglusterfs/src/libglusterfs.sym @@ -1169,3 +1169,4 @@ glusterfs_process_svc_detach mgmt_is_multiplexed_daemon xlator_is_cleanup_starting gf_nanosleep +graph_total_client_xlator diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 9906809f7aa..8605fbd0e6f 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -1542,3 +1542,26 @@ xlator_is_cleanup_starting(xlator_t *this) out: return cleanup; } + +int +graph_total_client_xlator(glusterfs_graph_t *graph) +{ + xlator_t *xl = NULL; + int count = 0; + + if (!graph) { + gf_msg("xlator", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, + "graph object is null"); + goto out; + } + + xl = graph->first; + while (xl) { + if (strcmp(xl->type, "protocol/client") == 0) { + count++; + } + xl = xl->next; + } +out: + return count; +} diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index 8ef05378351..aa65a1f8766 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -1858,7 +1858,7 @@ rpc_clnt_unref(struct rpc_clnt *rpc) return rpc; } -void +int rpc_clnt_disable(struct rpc_clnt *rpc) { rpc_clnt_connection_t *conn = NULL; @@ -1902,8 +1902,9 @@ rpc_clnt_disable(struct rpc_clnt *rpc) } pthread_mutex_unlock(&conn->lock); + ret = -1; if (trans) { - rpc_transport_disconnect(trans, _gf_true); + ret = rpc_transport_disconnect(trans, _gf_true); /* The auth_value was being reset to AUTH_GLUSTERFS_v2. * if (clnt->auth_value) * clnt->auth_value = AUTH_GLUSTERFS_v2; @@ -1919,7 +1920,6 @@ rpc_clnt_disable(struct rpc_clnt *rpc) * on a connected transport and hence its strictly serialized. 
*/ } - if (unref) rpc_clnt_unref(rpc); @@ -1930,7 +1930,7 @@ rpc_clnt_disable(struct rpc_clnt *rpc) rpc_clnt_unref(rpc); out: - return; + return ret; } void diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index b46feed50c8..1d3274bbddd 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -250,7 +250,7 @@ int rpcclnt_cbk_program_register(struct rpc_clnt *svc, rpcclnt_cb_program_t *program, void *mydata); -void +int rpc_clnt_disable(struct rpc_clnt *rpc); int diff --git a/tests/basic/graph-cleanup-brick-down-shd-mux.t b/tests/basic/graph-cleanup-brick-down-shd-mux.t new file mode 100644 index 00000000000..3c621cdcc26 --- /dev/null +++ b/tests/basic/graph-cleanup-brick-down-shd-mux.t @@ -0,0 +1,64 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TESTS_EXPECTED_IN_LOOP=4 + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2,3,4,5} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 performance.flush-behind off +TEST $CLI volume start $V0 + +for i in $(seq 1 2); do + TEST $CLI volume create ${V0}_afr$i replica 3 $H0:$B0/${V0}_afr${i}{0,1,2,3,4,5} + TEST $CLI volume start ${V0}_afr$i + TEST $CLI volume create ${V0}_ec$i disperse 6 redundancy 2 $H0:$B0/${V0}_ec${i}{0,1,2,3,4,5} + TEST $CLI volume start ${V0}_ec$i +done + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count +#Check the thread count become to number of volumes*number of ec subvolume (2*6=12) +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "ec_shd_index_healer" +#Check the thread count become to number of volumes*number of afr subvolume (3*6=18) +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "afr_shd_index_healer" + +#kill one brick and test cleanup +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST $CLI volume stop $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT 
"^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" + +#kill an entire subvol and test cleanup +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST kill_brick $V0 $H0 $B0/${V0}2 +#wait for some time to create a race scenario +sleep 1 +TEST $CLI volume stop $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" + +#kill all bricks and test cleanup +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST kill_brick $V0 $H0 $B0/${V0}3 +TEST kill_brick $V0 $H0 $B0/${V0}4 +TEST kill_brick $V0 $H0 $B0/${V0}5 +#wait for some time to create a race scenario +sleep 2 + +TEST $CLI volume stop $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer" + +cleanup diff --git a/tests/basic/volume-scale-shd-mux.t b/tests/basic/volume-scale-shd-mux.t index 89b833d5ddc..d1ddcbca7dd 100644 --- a/tests/basic/volume-scale-shd-mux.t +++ b/tests/basic/volume-scale-shd-mux.t @@ -23,8 +23,6 @@ for i in $(seq 1 2); do done EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count - -EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count #Check the thread count become to number of volumes*number of ec subvolume (2*6=12) EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "__ec_shd_healer_wait" #Check the thread count become to number of volumes*number of afr subvolume (3*6=18) EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "__afr_sh @@ -38,9 +36,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^21$" number_healer_threads_shd $V0 "__afr_sh #Remove the brick and check the detach
is successful $CLI volume remove-brick $V0 $H0:$B0/${V0}{6,7,8} force - EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "__afr_shd_healer_wait" +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup" TEST $CLI volume add-brick ${V0}_ec1 $H0:$B0/${V0}_ec1_add{0,1,2,3,4,5}; #Check the thread count become to number of volumes*number of ec subvolume plus 2 additional threads from newly added bricks (2*6+6=18) @@ -92,6 +90,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^9$" number_healer_threads_shd $V0 "__afr_shd TEST $CLI volume remove-brick ${V0}_distribute1 replica 1 $H0:$B0/add/{2..3} force EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^6$" number_healer_threads_shd $V0 "__afr_shd_healer_wait" +#Before stopping the process, make sure there are no pending cleanup threads hanging +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup" + TEST $CLI volume stop ${V0} TEST $CLI volume delete ${V0} EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" shd_count diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 776e7160c51..45e7bfedf91 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -61,9 +61,54 @@ out: } int +client_is_last_child_down(xlator_t *this, int32_t event, struct rpc_clnt *rpc) +{ + rpc_clnt_connection_t *conn = NULL; + int ret = 0; + + clnt_conf_t *conf = this->private; + if (!this || !rpc || !conf) + goto out; + + if (!conf->parent_down) + goto out; + conn = &rpc->conn; + pthread_mutex_lock(&conn->lock); + { + if (event == GF_EVENT_CHILD_DOWN && !conn->reconnect && rpc->disabled) { + ret = 1; + } + } + pthread_mutex_unlock(&conn->lock); +out: + return ret; +} + +int client_notify_dispatch_uniq(xlator_t *this, int32_t event, void *data, ...)
{ clnt_conf_t *conf = this->private; + glusterfs_ctx_t *ctx = this->ctx; + glusterfs_graph_t *graph = this->graph; + + pthread_mutex_lock(&ctx->notify_lock); + { + while (ctx->notifying) + pthread_cond_wait(&ctx->notify_cond, &ctx->notify_lock); + + if (client_is_last_child_down(this, event, data) && graph) { + pthread_mutex_lock(&graph->mutex); + { + graph->parent_down++; + if (graph->parent_down == graph_total_client_xlator(graph)) { + graph->used = 0; + pthread_cond_broadcast(&graph->child_down_cond); + } + } + pthread_mutex_unlock(&graph->mutex); + } + } + pthread_mutex_unlock(&ctx->notify_lock); if (conf->last_sent_event == event) return 0; @@ -81,6 +126,7 @@ client_notify_dispatch(xlator_t *this, int32_t event, void *data, ...) { int ret = -1; glusterfs_ctx_t *ctx = this->ctx; + clnt_conf_t *conf = this->private; pthread_mutex_lock(&ctx->notify_lock); @@ -94,6 +140,7 @@ client_notify_dispatch(xlator_t *this, int32_t event, void *data, ...) /* We assume that all translators in the graph handle notification * events in sequence. * */ + ret = default_notify(this, event, data); /* NB (Even) with MT-epoll and EPOLLET|EPOLLONESHOT we are guaranteed @@ -2376,7 +2423,7 @@ client_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, replicate), hence make sure events which are passed to parent are genuine */ ret = client_notify_dispatch_uniq(this, GF_EVENT_CHILD_DOWN, - NULL); + rpc); if (is_parent_down) { /* If parent is down, then there should not be any * operation after a child down. @@ -2424,6 +2471,8 @@ int notify(xlator_t *this, int32_t event, void *data, ...) { clnt_conf_t *conf = NULL; + glusterfs_graph_t *graph = this->graph; + int ret = -1; conf = this->private; if (!conf) @@ -2450,7 +2499,19 @@ notify(xlator_t *this, int32_t event, void *data, ...) 
} pthread_mutex_unlock(&conf->lock); - rpc_clnt_disable(conf->rpc); + ret = rpc_clnt_disable(conf->rpc); + if (ret == -1 && graph) { + pthread_mutex_lock(&graph->mutex); + { + graph->parent_down++; + if (graph->parent_down == + graph_total_client_xlator(graph)) { + graph->used = 0; + pthread_cond_broadcast(&graph->child_down_cond); + } + } + pthread_mutex_unlock(&graph->mutex); + } break; default: |