| author | Jeff Darcy <jdarcy@redhat.com> | 2017-03-20 12:32:33 -0400 |
|---|---|---|
| committer | Raghavendra Talur <rtalur@redhat.com> | 2017-04-24 15:49:00 +0000 |
| commit | e6045103b7e010779549bb486c00a07b3c3eb0fc | |
| tree | 74a7d99741869208f9a2d54d18b8e3d9b8665959 /tests | |
| parent | e37d546042a73bec85fe2ebd0146b88b7079bc66 | |
glusterd: hold off volume deletes while still restarting bricks
We need to do this because modifying the volume/brick tree while
glusterd_restart_bricks is still walking it can lead to segfaults.
Without waiting we could accidentally "slip in" while attach_brick has
released big_lock between retries and make such a modification.
Backport of:
> Commit a7ce0548b7969050644891cd90c0bf134fa1594c
> BUG: 1432542
> Reviewed-on: https://review.gluster.org/16927
Change-Id: I30ccc4efa8d286aae847250f5d4fb28956a74b03
BUG: 1441476
Signed-off-by: Jeff Darcy <jeff@pl.atyp.us>
Reviewed-on: https://review.gluster.org/17044
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
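
The glusterd-side fix itself is not shown below (the diffstat is limited to 'tests'), but the pattern the commit message describes — making volume deletes wait until glusterd_restart_bricks has finished walking the volume/brick tree — has roughly the shape sketched here. This is a hypothetical illustration in plain pthreads: the names (`restart_in_progress`, `restart_done`, and the three helpers) are invented for this sketch, and the real GlusterD code coordinates through its own big_lock/synctask machinery rather than raw condition variables.

```c
/*
 * Hypothetical sketch of the "hold off deletes while restarting bricks"
 * pattern described in the commit message.  None of these names exist in
 * GlusterD, which coordinates through its own big_lock/synctask machinery;
 * this just shows the shape of the fix with plain pthreads.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t restart_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t restart_done = PTHREAD_COND_INITIALIZER;
static bool restart_in_progress = false;

/* Called before the restart path starts walking the volume/brick tree. */
static void begin_brick_restart(void)
{
        pthread_mutex_lock(&restart_lock);
        restart_in_progress = true;
        pthread_mutex_unlock(&restart_lock);
}

/* Called once every attach/restart attempt has completed or given up. */
static void end_brick_restart(void)
{
        pthread_mutex_lock(&restart_lock);
        restart_in_progress = false;
        pthread_cond_broadcast(&restart_done);
        pthread_mutex_unlock(&restart_lock);
}

/* The volume-delete path blocks here instead of failing, so it can never
 * modify the volume/brick tree while the restart walk is still using it. */
static void wait_for_brick_restart(void)
{
        pthread_mutex_lock(&restart_lock);
        while (restart_in_progress)
                pthread_cond_wait(&restart_done, &restart_lock);
        pthread_mutex_unlock(&restart_lock);
}

static void *restarter(void *arg)
{
        (void)arg;
        begin_brick_restart();
        sleep(1); /* stand-in for walking and re-attaching bricks */
        end_brick_restart();
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, restarter, NULL);
        usleep(100 * 1000); /* give the restart a moment to begin */
        wait_for_brick_restart(); /* a concurrent delete parks here */
        printf("restart finished; safe to delete volumes\n");
        pthread_join(t, NULL);
        return 0;
}
```

The key design point is that the delete path blocks rather than fails: a delete that arrives mid-restart is merely delayed until the tree walk is done, so it can never mutate the tree out from under the walker.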
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t (renamed from tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t) | 5 |
|---|---|---|
| -rw-r--r-- | tests/bugs/core/bug-1432542-mpx-restart-crash.t | 91 |

2 files changed, 96 insertions, 0 deletions
```diff
diff --git a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t
index ed401f6e6ad..a227f8275ed 100644
--- a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t
+++ b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t
@@ -21,6 +21,11 @@ push_trapfunc "cleanup"
 
 TEST $CLI volume create $V0 $H0:$B0/brick{0,1}
 TEST $CLI volume start $V0
+# We can't expect a valid port number instantly.  We need to wait for the
+# bricks to finish coming up.  In every other case we use EXPECT_WITHIN, but
+# this first time we need to wait more explicitly.
+sleep $PROCESS_UP_TIMEOUT
+
 port_brick0=$(get_nth_brick_port_for_volume $V0 1)
 
 # restart the volume
diff --git a/tests/bugs/core/bug-1432542-mpx-restart-crash.t b/tests/bugs/core/bug-1432542-mpx-restart-crash.t
new file mode 100644
index 00000000000..970a181c83d
--- /dev/null
+++ b/tests/bugs/core/bug-1432542-mpx-restart-crash.t
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../traps.rc
+
+NUM_VOLS=20
+MOUNT_BASE=$(dirname $M0)
+
+# GlusterD reports that bricks are started when in fact their attach requests
+# might still need to be retried.  That's a bit of a hack, but there's no
+# feasible way to wait at that point (in attach_brick) and the rest of the
+# code is unprepared to deal with transient errors so the whole "brick start"
+# would fail.  Meanwhile, glusterfsd can only handle attach requests at a
+# rather slow rate.  After GlusterD tries to start a couple of hundred bricks,
+# glusterfsd can fall behind and we start getting mount failures.  Arguably,
+# those are spurious because we will eventually catch up.  We're just not
+# ready *yet*.  More to the point, even if the errors aren't spurious that's
+# not what we're testing right now.  Therefore, we give glusterfsd a bit more
+# breathing room for this test than we would otherwise.
+MOUNT_TIMEOUT=15
+
+get_brick_base () {
+	printf "%s/vol%02d" $B0 $1
+}
+
+get_mount_point () {
+	printf "%s/vol%02d" $MOUNT_BASE $1
+}
+
+create_volume () {
+
+	local vol_name=$(printf "%s-vol%02d" $V0 $1)
+
+	local brick_base=$(get_brick_base $1)
+	local cmd="$CLI volume create $vol_name replica 2"
+	local b
+	for b in $(seq 0 5); do
+		local this_brick=${brick_base}/brick$b
+		mkdir -p $this_brick
+		cmd="$cmd $H0:$this_brick"
+	done
+	TEST $cmd
+	TEST $CLI volume start $vol_name
+	EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $vol_name "Status"
+	local mount_point=$(get_mount_point $1)
+	mkdir -p $mount_point
+	TEST $GFS -s $H0 --volfile-id=$vol_name $mount_point
+}
+
+cleanup_func () {
+	local v
+	for v in $(seq 1 $NUM_VOLS); do
+		local mount_point=$(get_mount_point $v)
+		force_umount $mount_point
+		rm -rf $mount_point
+		local vol_name=$(printf "%s-vol%02d" $V0 $v)
+		$CLI volume stop $vol_name
+		$CLI volume delete $vol_name
+		rm -rf $(get_brick_base $1) &
+	done &> /dev/null
+	wait
+}
+push_trapfunc cleanup_func
+
+TEST glusterd
+TEST $CLI volume set all cluster.brick-multiplex on
+
+# Our infrastructure can't handle an arithmetic expression here.  The formula
+# is (NUM_VOLS-1)*5 because it sees each TEST/EXPECT once but needs the other
+# NUM_VOLS-1 and there are 5 such statements in each iteration.
+TESTS_EXPECTED_IN_LOOP=95
+for i in $(seq 1 $NUM_VOLS); do
+	create_volume $i
+	TEST dd if=/dev/zero of=$(get_mount_point $i)/a_file bs=4k count=1
+done
+
+# Kill glusterd, and wait a bit for all traces to disappear.
+TEST killall -9 glusterd
+sleep 5
+TEST killall -9 glusterfsd
+sleep 5
+
+# Restart glusterd.  This is where the brick daemon supposedly dumps core,
+# though I (jdarcy) have yet to see that.  Again, give it a while to settle,
+# just to be sure.
+TEST glusterd
+
+cleanup_func
+trap - EXIT
+cleanup
```
