tests/bugs/core/bug-1432542-mpx-restart-crash.t


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

#!/bin/bash

. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../traps.rc

NUM_VOLS=20
MOUNT_BASE=$(dirname $M0)

# GlusterD reports that bricks are started when in fact their attach requests
# might still need to be retried.  That's a bit of a hack, but there's no
# feasible way to wait at that point (in attach_brick) and the rest of the
# code is unprepared to deal with transient errors so the whole "brick start"
# would fail.  Meanwhile, glusterfsd can only handle attach requests at a
# rather slow rate.  After GlusterD tries to start a couple of hundred bricks,
# glusterfsd can fall behind and we start getting mount failures.  Arguably,
# those are spurious because we will eventually catch up.  We're just not
# ready *yet*.  More to the point, even if the errors aren't spurious that's
# not what we're testing right now.  Therefore, we give glusterfsd a bit more
# breathing room for this test than we would otherwise.
MOUNT_TIMEOUT=15

get_brick_base () {
	printf "%s/vol%02d" $B0 $1
}

get_mount_point () {
	printf "%s/vol%02d" $MOUNT_BASE $1
}

create_volume () {

	local vol_name=$(printf "%s-vol%02d" $V0 $1)

	local brick_base=$(get_brick_base $1)
	local cmd="$CLI volume create $vol_name replica 2"
	local b
	for b in $(seq 0 5); do
		local this_brick=${brick_base}/brick$b
		mkdir -p $this_brick
		cmd="$cmd $H0:$this_brick"
	done
	TEST $cmd
	TEST $CLI volume start $vol_name
	EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $vol_name "Status"
	local mount_point=$(get_mount_point $1)
	mkdir -p $mount_point
	TEST $GFS -s $H0 --volfile-id=$vol_name $mount_point
}

cleanup_func () {
	local v
	for v in $(seq 1 $NUM_VOLS); do
		local mount_point=$(get_mount_point $v)
		force_umount $mount_point
		rm -rf $mount_point
		local vol_name=$(printf "%s-vol%02d" $V0 $v)
		$CLI volume stop $vol_name
		$CLI volume delete $vol_name
		rm -rf $(get_brick_base $1) &
	done &> /dev/null
	wait
}
push_trapfunc cleanup_func

TEST glusterd
TEST $CLI volume set all cluster.brick-multiplex on

# Our infrastructure can't handle an arithmetic expression here.  The formula
# is (NUM_VOLS-1)*5 because it sees each TEST/EXPECT once but needs the other
# NUM_VOLS-1 and there are 5 such statements in each iteration.
TESTS_EXPECTED_IN_LOOP=95
for i in $(seq 1 $NUM_VOLS); do
	create_volume $i
	TEST dd if=/dev/zero of=$(get_mount_point $i)/a_file bs=4k count=1
done

# Kill glusterd, and wait a bit for all traces to disappear.
TEST killall -9 glusterd
sleep 5
TEST killall -9 glusterfsd
sleep 5

# Restart glusterd.  This is where the brick daemon supposedly dumps core,
# though I (jdarcy) have yet to see that.  Again, give it a while to settle,
# just to be sure.
TEST glusterd

cleanup_func
trap - EXIT
cleanup