graph/cleanup: Fix race in graph cleanup

We were unconditionally cleaning up the graph when we got child_down followed by parent_down. But this is prone to a race condition when some of the bricks are already disconnected. In this case, we might have freed the graph even before the last child_down is executed in the client xlator code, because the child_down event was already received. To fix this race, we have introduced a check to see if all client xlators have cleared their reconnect chain and called child_down for the last time. Change-Id: I7d02813bc366dac733a836e0cd7b14a6fac52042 fixes: bz#1727329 Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
author: Mohammed Rafi KC <rkavunga@redhat.com> 2019-07-05 20:12:59 +0530
committer: Amar Tumballi <amarts@gmail.com> 2019-09-05 16:14:44 +0000
commit: 43635716e6bd5bd5925fa9194b0853ee919a742d (patch)
tree: 985078d45437b1a74f119c762072fe333e92ce06
parent: d026f0bcfd301712e4f0671ccf238f43f2e6dd30 (diff)
11 files changed, 169 insertions, 12 deletions
diff --git a/libglusterfs/src/defaults-tmpl.c b/libglusterfs/src/defaults-tmpl.c
index 82e7f78d7f3..3cf707f42aa 100644
--- a/libglusterfs/src/defaults-tmpl.c
+++ b/libglusterfs/src/defaults-tmpl.c
@@ -171,8 +171,11 @@ default_notify(xlator_t *this, int32_t event, void *data, ...)
                 /* Make sure this is not a daemon with master xlator */
                 pthread_mutex_lock(&graph->mutex);
                 {
-                    graph->used = 0;
-                    pthread_cond_broadcast(&graph->child_down_cond);
+                    if (graph->parent_down ==
+                        graph_total_client_xlator(graph)) {
+                        graph->used = 0;
+                        pthread_cond_broadcast(&graph->child_down_cond);
+                    }
                 }
                 pthread_mutex_unlock(&graph->mutex);
             }
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
index 01262dcd9f5..155bf435386 100644
--- a/libglusterfs/src/glusterfs/glusterfs.h
+++ b/libglusterfs/src/glusterfs/glusterfs.h
@@ -594,6 +594,7 @@ struct _glusterfs_graph {
                       in client multiplexed code path */
     pthread_mutex_t mutex;
     pthread_cond_t child_down_cond; /* for broadcasting CHILD_DOWN */
+    int parent_down;
     char graph_uuid[128];
 };
 typedef struct _glusterfs_graph glusterfs_graph_t;
diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h
index 6449e59f484..6608d6cdf0d 100644
--- a/libglusterfs/src/glusterfs/xlator.h
+++ b/libglusterfs/src/glusterfs/xlator.h
@@ -1095,4 +1095,6 @@ mgmt_is_multiplexed_daemon(char *name);
 
 gf_boolean_t
 xlator_is_cleanup_starting(xlator_t *this);
+int
+graph_total_client_xlator(glusterfs_graph_t *graph);
 #endif /* _XLATOR_H */
diff --git a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c
index bbc5ad68d94..e6ae40db2ed 100644
--- a/libglusterfs/src/graph.c
+++ b/libglusterfs/src/graph.c
@@ -1695,6 +1695,7 @@ glusterfs_process_svc_attach_volfp(glusterfs_ctx_t *ctx, FILE *fp,
                "failed to construct the graph");
         goto out;
     }
+    graph->parent_down = 0;
     graph->last_xl = glusterfs_get_last_xlator(graph);
 
     for (xl = graph->first; xl; xl = xl->next) {
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
index 2e83d3f1003..dc7382ba749 100644
--- a/libglusterfs/src/libglusterfs.sym
+++ b/libglusterfs/src/libglusterfs.sym
@@ -1169,3 +1169,4 @@ glusterfs_process_svc_detach
 mgmt_is_multiplexed_daemon
 xlator_is_cleanup_starting
 gf_nanosleep
+graph_total_client_xlator
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index 9906809f7aa..8605fbd0e6f 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -1542,3 +1542,26 @@ xlator_is_cleanup_starting(xlator_t *this)
 out:
     return cleanup;
 }
+
+int
+graph_total_client_xlator(glusterfs_graph_t *graph)
+{
+    xlator_t *xl = NULL;
+    int count = 0;
+
+    if (!graph) {
+        gf_msg("xlator", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG,
+               "graph object is null");
+        goto out;
+    }
+
+    xl = graph->first;
+    while (xl) {
+        if (strcmp(xl->type, "protocol/client") == 0) {
+            count++;
+        }
+        xl = xl->next;
+    }
+out:
+    return count;
+}
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index 8ef05378351..aa65a1f8766 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -1858,7 +1858,7 @@ rpc_clnt_unref(struct rpc_clnt *rpc)
     return rpc;
 }
 
-void
+int
 rpc_clnt_disable(struct rpc_clnt *rpc)
 {
     rpc_clnt_connection_t *conn = NULL;
@@ -1902,8 +1902,9 @@ rpc_clnt_disable(struct rpc_clnt *rpc)
     }
     pthread_mutex_unlock(&conn->lock);
 
+    ret = -1;
     if (trans) {
-        rpc_transport_disconnect(trans, _gf_true);
+        ret = rpc_transport_disconnect(trans, _gf_true);
         /* The auth_value was being reset to AUTH_GLUSTERFS_v2.
          *    if (clnt->auth_value)
          *           clnt->auth_value = AUTH_GLUSTERFS_v2;
@@ -1919,7 +1920,6 @@ rpc_clnt_disable(struct rpc_clnt *rpc)
          * on a connected transport and hence its strictly serialized.
          */
     }
-
     if (unref)
         rpc_clnt_unref(rpc);
 
@@ -1930,7 +1930,7 @@ rpc_clnt_disable(struct rpc_clnt *rpc)
         rpc_clnt_unref(rpc);
 
 out:
-    return;
+    return ret;
 }
 
 void
diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h
index b46feed50c8..1d3274bbddd 100644
--- a/rpc/rpc-lib/src/rpc-clnt.h
+++ b/rpc/rpc-lib/src/rpc-clnt.h
@@ -250,7 +250,7 @@ int
 rpcclnt_cbk_program_register(struct rpc_clnt *svc,
                              rpcclnt_cb_program_t *program, void *mydata);
 
-void
+int
 rpc_clnt_disable(struct rpc_clnt *rpc);
 
 int
diff --git a/tests/basic/graph-cleanup-brick-down-shd-mux.t b/tests/basic/graph-cleanup-brick-down-shd-mux.t
new file mode 100644
index 00000000000..3c621cdcc26
--- /dev/null
+++ b/tests/basic/graph-cleanup-brick-down-shd-mux.t
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TESTS_EXPECTED_IN_LOOP=4
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2,3,4,5}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 performance.flush-behind off
+TEST $CLI volume start $V0
+
+for i in $(seq 1 2); do
+   TEST $CLI volume create ${V0}_afr$i replica 3 $H0:$B0/${V0}_afr${i}{0,1,2,3,4,5}
+   TEST $CLI volume start ${V0}_afr$i
+   TEST $CLI volume create ${V0}_ec$i disperse 6 redundancy 2 $H0:$B0/${V0}_ec${i}{0,1,2,3,4,5}
+   TEST $CLI volume start ${V0}_ec$i
+done
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
+#Check the thread count become to number of volumes*number of ec subvolume (2*6=12)
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "ec_shd_index_healer"
+#Check the thread count become to number of volumes*number of afr subvolume (3*6=18)
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "afr_shd_index_healer"
+
+#kill one brick and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+#kill an entire subvol and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+#wait for some time to create a race scenario
+sleep 1
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+#kill all bricks and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill_brick $V0 $H0 $B0/${V0}3
+TEST kill_brick $V0 $H0 $B0/${V0}4
+TEST kill_brick $V0 $H0 $B0/${V0}5
+#wait for some time to create a race scenario
+sleep 2
+
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+cleanup
diff --git a/tests/basic/volume-scale-shd-mux.t b/tests/basic/volume-scale-shd-mux.t
index 89b833d5ddc..d1ddcbca7dd 100644
--- a/tests/basic/volume-scale-shd-mux.t
+++ b/tests/basic/volume-scale-shd-mux.t
@@ -23,8 +23,6 @@ for i in $(seq 1 2); do
 done
 
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
-
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
 #Check the thread count become to number of volumes*number of ec subvolume (2*6=12)
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "__ec_shd_healer_wait"
 #Check the thread count become to number of volumes*number of afr subvolume (3*6=18)
@@ -38,9 +36,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^21$" number_healer_threads_shd $V0 "__afr_sh
 
 #Remove the brick and check the detach is successful
 $CLI volume remove-brick $V0 $H0:$B0/${V0}{6,7,8} force
-
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "__afr_shd_healer_wait"
 
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup"
 TEST $CLI volume add-brick ${V0}_ec1 $H0:$B0/${V0}_ec1_add{0,1,2,3,4,5};
 #Check the thread count become to number of volumes*number of ec subvolume plus 2 additional threads from newly added bricks (2*6+6=18)
 
@@ -92,6 +90,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^9$" number_healer_threads_shd $V0 "__afr_shd
 TEST $CLI volume remove-brick ${V0}_distribute1 replica 1 $H0:$B0/add/{2..3} force
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^6$" number_healer_threads_shd $V0 "__afr_shd_healer_wait"
 
+#Before stopping the process, make sure there are no pending cleanup threads hanging
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup"
+
 TEST $CLI volume stop ${V0}
 TEST $CLI volume delete ${V0}
 EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" shd_count
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 776e7160c51..45e7bfedf91 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -61,9 +61,54 @@ out:
 }
 
 int
+client_is_last_child_down(xlator_t *this, int32_t event, struct rpc_clnt *rpc)
+{
+    rpc_clnt_connection_t *conn = NULL;
+    int ret = 0;
+
+    clnt_conf_t *conf = this->private;
+    if (!this || !rpc || !conf)
+        goto out;
+
+    if (!conf->parent_down)
+        goto out;
+    conn = &rpc->conn;
+    pthread_mutex_lock(&conn->lock);
+    {
+        if (event == GF_EVENT_CHILD_DOWN && !conn->reconnect && rpc->disabled) {
+            ret = 1;
+        }
+    }
+    pthread_mutex_unlock(&conn->lock);
+out:
+    return ret;
+}
+
+int
 client_notify_dispatch_uniq(xlator_t *this, int32_t event, void *data, ...)
 {
     clnt_conf_t *conf = this->private;
+    glusterfs_ctx_t *ctx = this->ctx;
+    glusterfs_graph_t *graph = this->graph;
+
+    pthread_mutex_lock(&ctx->notify_lock);
+    {
+        while (ctx->notifying)
+            pthread_cond_wait(&ctx->notify_cond, &ctx->notify_lock);
+
+        if (client_is_last_child_down(this, event, data) && graph) {
+            pthread_mutex_lock(&graph->mutex);
+            {
+                graph->parent_down++;
+                if (graph->parent_down == graph_total_client_xlator(graph)) {
+                    graph->used = 0;
+                    pthread_cond_broadcast(&graph->child_down_cond);
+                }
+            }
+            pthread_mutex_unlock(&graph->mutex);
+        }
+    }
+    pthread_mutex_unlock(&ctx->notify_lock);
 
     if (conf->last_sent_event == event)
         return 0;
@@ -81,6 +126,7 @@ client_notify_dispatch(xlator_t *this, int32_t event, void *data, ...)
 {
     int ret = -1;
     glusterfs_ctx_t *ctx = this->ctx;
+
     clnt_conf_t *conf = this->private;
 
     pthread_mutex_lock(&ctx->notify_lock);
@@ -94,6 +140,7 @@ client_notify_dispatch(xlator_t *this, int32_t event, void *data, ...)
     /* We assume that all translators in the graph handle notification
      * events in sequence.
      * */
+
     ret = default_notify(this, event, data);
 
     /* NB (Even) with MT-epoll and EPOLLET|EPOLLONESHOT we are guaranteed
@@ -2376,7 +2423,7 @@ client_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
                    replicate), hence make sure events which are passed
                    to parent are genuine */
                 ret = client_notify_dispatch_uniq(this, GF_EVENT_CHILD_DOWN,
-                                                  NULL);
+                                                  rpc);
                 if (is_parent_down) {
                     /* If parent is down, then there should not be any
                      * operation after a child down.
@@ -2424,6 +2471,8 @@ int
 notify(xlator_t *this, int32_t event, void *data, ...)
 {
     clnt_conf_t *conf = NULL;
+    glusterfs_graph_t *graph = this->graph;
+    int ret = -1;
 
     conf = this->private;
     if (!conf)
@@ -2450,7 +2499,19 @@ notify(xlator_t *this, int32_t event, void *data, ...)
             }
             pthread_mutex_unlock(&conf->lock);
 
-            rpc_clnt_disable(conf->rpc);
+            ret = rpc_clnt_disable(conf->rpc);
+            if (ret == -1 && graph) {
+                pthread_mutex_lock(&graph->mutex);
+                {
+                    graph->parent_down++;
+                    if (graph->parent_down ==
+                        graph_total_client_xlator(graph)) {
+                        graph->used = 0;
+                        pthread_cond_broadcast(&graph->child_down_cond);
+                    }
+                }
+                pthread_mutex_unlock(&graph->mutex);
+            }
             break;
 
         default:
author	Mohammed Rafi KC <rkavunga@redhat.com>	2019-07-05 20:12:59 +0530
committer	Amar Tumballi <amarts@gmail.com>	2019-09-05 16:14:44 +0000
commit	43635716e6bd5bd5925fa9194b0853ee919a742d (patch)
tree	985078d45437b1a74f119c762072fe333e92ce06
parent	d026f0bcfd301712e4f0671ccf238f43f2e6dd30 (diff)