| author    | Mohammed Rafi KC <rkavunga@redhat.com>           | 2019-07-05 20:12:59 +0530 |
| ---       | ---                                              | ---                       |
| committer | Amar Tumballi <amarts@gmail.com>                 | 2019-09-05 16:14:44 +0000 |
| commit    | 43635716e6bd5bd5925fa9194b0853ee919a742d (patch) |                           |
| tree      | 985078d45437b1a74f119c762072fe333e92ce06 /tests  |                           |
| parent    | d026f0bcfd301712e4f0671ccf238f43f2e6dd30 (diff)  |                           |
graph/cleanup: Fix race in graph cleanup
We were unconditionally cleaning up the graph when we got a
child_down followed by a parent_down. But this is prone to a
race condition when some of the bricks are already disconnected:
because the child_down event has already been received, we might
free the graph even before the last child_down has finished
executing in the client xlator code.

To fix this race, we introduce a check that every client xlator
has cleared its reconnect chain and called child_down for the
last time before the graph is cleaned up.
Change-Id: I7d02813bc366dac733a836e0cd7b14a6fac52042
fixes: bz#1727329
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
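
To make the gating logic concrete, here is a minimal sketch of the check described above. This is not the code from the patch; the types and names (`client_state_t`, `graph_can_be_freed`, `maybe_cleanup_graph`) are hypothetical and only model the idea that the graph may be freed only once every client xlator has cleared its reconnect chain and delivered its final child_down.

```c
/* Illustrative sketch only -- not the actual glusterfs implementation.
 * It models the check from the commit message: free the graph only
 * after every client xlator has finished winding down. */
#include <stdbool.h>
#include <stddef.h>

typedef struct client_state {
    bool reconnect_cleared;    /* reconnect chain/timer torn down           */
    bool last_child_down_sent; /* final child_down delivered to the parent  */
} client_state_t;

typedef struct graph {
    client_state_t *clients;
    size_t nclients;
} graph_t;

/* Previously the graph was freed as soon as child_down was followed by
 * parent_down. With bricks already disconnected, child_down may have been
 * recorded before the client xlator finished its own teardown, so the
 * graph could be freed while that code was still running. */
static bool
graph_can_be_freed(const graph_t *graph)
{
    for (size_t i = 0; i < graph->nclients; i++) {
        const client_state_t *c = &graph->clients[i];
        if (!c->reconnect_cleared || !c->last_child_down_sent)
            return false; /* some client is still winding down */
    }
    return true;
}

/* Called on parent_down, and again whenever a straggling client finishes. */
void
maybe_cleanup_graph(graph_t *graph, void (*do_free)(graph_t *))
{
    if (graph_can_be_freed(graph))
        do_free(graph); /* safe: no client will touch the graph again */
}
```

The tests below exercise the same window by killing one brick, a whole subvolume, or all bricks shortly before stopping the volume, then checking that the self-heal daemon's healer threads and graph-cleanup threads settle to the expected counts.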
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/basic/graph-cleanup-brick-down-shd-mux.t | 64 |
| -rw-r--r-- | tests/basic/volume-scale-shd-mux.t             |  7 |
2 files changed, 68 insertions, 3 deletions
diff --git a/tests/basic/graph-cleanup-brick-down-shd-mux.t b/tests/basic/graph-cleanup-brick-down-shd-mux.t
new file mode 100644
index 00000000000..3c621cdcc26
--- /dev/null
+++ b/tests/basic/graph-cleanup-brick-down-shd-mux.t
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TESTS_EXPECTED_IN_LOOP=4
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2,3,4,5}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 performance.flush-behind off
+TEST $CLI volume start $V0
+
+for i in $(seq 1 2); do
+   TEST $CLI volume create ${V0}_afr$i replica 3 $H0:$B0/${V0}_afr${i}{0,1,2,3,4,5}
+   TEST $CLI volume start ${V0}_afr$i
+   TEST $CLI volume create ${V0}_ec$i disperse 6 redundancy 2 $H0:$B0/${V0}_ec${i}{0,1,2,3,4,5}
+   TEST $CLI volume start ${V0}_ec$i
+done
+
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
+#Check that the thread count becomes number of volumes*number of ec subvolumes (2*6=12)
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "ec_shd_index_healer"
+#Check that the thread count becomes number of volumes*number of afr subvolumes (3*6=18)
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "afr_shd_index_healer"
+
+#kill one brick and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+#kill an entire subvol and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+#wait for some time to create a race scenario
+sleep 1
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+#kill all bricks and test cleanup
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST kill_brick $V0 $H0 $B0/${V0}3
+TEST kill_brick $V0 $H0 $B0/${V0}4
+TEST kill_brick $V0 $H0 $B0/${V0}5
+#wait for some time to create a race scenario
+sleep 2
+
+TEST $CLI volume stop $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd ${V0}_afr1 "afr_shd_index_healer"
+
+cleanup
diff --git a/tests/basic/volume-scale-shd-mux.t b/tests/basic/volume-scale-shd-mux.t
index 89b833d5ddc..d1ddcbca7dd 100644
--- a/tests/basic/volume-scale-shd-mux.t
+++ b/tests/basic/volume-scale-shd-mux.t
@@ -23,8 +23,6 @@ for i in $(seq 1 2); do
 done
 
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
-
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" shd_count
 #Check the thread count become to number of volumes*number of ec subvolume (2*6=12)
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^12$" number_healer_threads_shd $V0 "__ec_shd_healer_wait"
 #Check the thread count become to number of volumes*number of afr subvolume (3*6=18)
@@ -38,9 +36,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^21$" number_healer_threads_shd $V0 "__afr_sh
 
 #Remove the brick and check the detach is successful
 $CLI volume remove-brick $V0 $H0:$B0/${V0}{6,7,8} force
-
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^18$" number_healer_threads_shd $V0 "__afr_shd_healer_wait"
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup"
 
 TEST $CLI volume add-brick ${V0}_ec1 $H0:$B0/${V0}_ec1_add{0,1,2,3,4,5};
 #Check the thread count become to number of volumes*number of ec subvolume plus 2 additional threads from newly added bricks (2*6+6=18)
@@ -92,6 +90,9 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^9$" number_healer_threads_shd $V0 "__afr_shd
 TEST $CLI volume remove-brick ${V0}_distribute1 replica 1 $H0:$B0/add/{2..3} force
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^6$" number_healer_threads_shd $V0 "__afr_shd_healer_wait"
 
+#Before stopping the process, make sure there are no pending cleanup threads hanging
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" number_healer_threads_shd $V0 "glusterfs_graph_cleanup"
+
 TEST $CLI volume stop ${V0}
 TEST $CLI volume delete ${V0}
 EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "^0$" shd_count