From a7ce0548b7969050644891cd90c0bf134fa1594c Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Mon, 20 Mar 2017 12:32:33 -0400 Subject: glusterd: hold off volume deletes while still restarting bricks We need to do this because modifying the volume/brick tree while glusterd_restart_bricks is still walking it can lead to segfaults. Without waiting we could accidentally "slip in" while attach_brick has released big_lock between retries and make such a modification. Change-Id: I30ccc4efa8d286aae847250f5d4fb28956a74b03 BUG: 1432542 Signed-off-by: Jeff Darcy Reviewed-on: https://review.gluster.org/16927 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Atin Mukherjee --- .../bugs/core/bug-1421590-brick-mux-resuse-ports.t | 55 ------------- .../bugs/core/bug-1421590-brick-mux-reuse-ports.t | 60 ++++++++++++++ tests/bugs/core/bug-1432542-mpx-restart-crash.t | 91 ++++++++++++++++++++++ xlators/mgmt/glusterd/src/glusterd-op-sm.c | 15 ++++ xlators/mgmt/glusterd/src/glusterd-utils.c | 39 +++++++--- xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 3 - xlators/mgmt/glusterd/src/glusterd.c | 1 + xlators/mgmt/glusterd/src/glusterd.h | 1 + 8 files changed, 196 insertions(+), 69 deletions(-) delete mode 100644 tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t create mode 100644 tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t create mode 100644 tests/bugs/core/bug-1432542-mpx-restart-crash.t diff --git a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t b/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t deleted file mode 100644 index ed401f6e6ad..00000000000 --- a/tests/bugs/core/bug-1421590-brick-mux-resuse-ports.t +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../traps.rc -. $(dirname $0)/../../volume.rc - -function get_nth_brick_port_for_volume () { - local VOL=$1 - local n=$2 - - $CLI volume status $VOL --xml | sed -ne 's/.*\([-0-9]*\)<\/port>/\1/p' \ - | head -n $n | tail -n 1 -} - -TEST glusterd - -TEST $CLI volume set all cluster.brick-multiplex on -push_trapfunc "$CLI volume set all cluster.brick-multiplex off" -push_trapfunc "cleanup" - -TEST $CLI volume create $V0 $H0:$B0/brick{0,1} -TEST $CLI volume start $V0 - -port_brick0=$(get_nth_brick_port_for_volume $V0 1) - -# restart the volume -TEST $CLI volume stop $V0 -TEST $CLI volume start $V0 - -EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 - -TEST $CLI volume stop $V0 -TEST $CLI volume set all cluster.brick-multiplex off - -TEST $CLI volume start $V0 - -EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 - -port_brick1=$(get_nth_brick_port_for_volume $V0 2) - -# restart the volume -TEST $CLI volume stop $V0 -TEST $CLI volume start $V0 - -EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 -EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick1 get_nth_brick_port_for_volume $V0 2 - -TEST $CLI volume stop $V0 - -TEST $CLI volume set all cluster.brick-multiplex on - -TEST $CLI volume start $V0 - -EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 - diff --git a/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t new file mode 100644 index 00000000000..a227f8275ed --- /dev/null +++ b/tests/bugs/core/bug-1421590-brick-mux-reuse-ports.t @@ -0,0 +1,60 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../traps.rc +. $(dirname $0)/../../volume.rc + +function get_nth_brick_port_for_volume () { + local VOL=$1 + local n=$2 + + $CLI volume status $VOL --xml | sed -ne 's/.*\([-0-9]*\)<\/port>/\1/p' \ + | head -n $n | tail -n 1 +} + +TEST glusterd + +TEST $CLI volume set all cluster.brick-multiplex on +push_trapfunc "$CLI volume set all cluster.brick-multiplex off" +push_trapfunc "cleanup" + +TEST $CLI volume create $V0 $H0:$B0/brick{0,1} +TEST $CLI volume start $V0 + +# We can't expect a valid port number instantly. We need to wait for the +# bricks to finish coming up. In every other case we use EXPECT_WITHIN, but +# this first time we need to wait more explicitly. +sleep $PROCESS_UP_TIMEOUT + +port_brick0=$(get_nth_brick_port_for_volume $V0 1) + +# restart the volume +TEST $CLI volume stop $V0 +TEST $CLI volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 + +TEST $CLI volume stop $V0 +TEST $CLI volume set all cluster.brick-multiplex off + +TEST $CLI volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 + +port_brick1=$(get_nth_brick_port_for_volume $V0 2) + +# restart the volume +TEST $CLI volume stop $V0 +TEST $CLI volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick1 get_nth_brick_port_for_volume $V0 2 + +TEST $CLI volume stop $V0 + +TEST $CLI volume set all cluster.brick-multiplex on + +TEST $CLI volume start $V0 + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT $port_brick0 get_nth_brick_port_for_volume $V0 1 + diff --git a/tests/bugs/core/bug-1432542-mpx-restart-crash.t b/tests/bugs/core/bug-1432542-mpx-restart-crash.t new file mode 100644 index 00000000000..970a181c83d --- /dev/null +++ b/tests/bugs/core/bug-1432542-mpx-restart-crash.t @@ -0,0 +1,91 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../traps.rc + +NUM_VOLS=20 +MOUNT_BASE=$(dirname $M0) + +# GlusterD reports that bricks are started when in fact their attach requests +# might still need to be retried. That's a bit of a hack, but there's no +# feasible way to wait at that point (in attach_brick) and the rest of the +# code is unprepared to deal with transient errors so the whole "brick start" +# would fail. Meanwhile, glusterfsd can only handle attach requests at a +# rather slow rate. After GlusterD tries to start a couple of hundred bricks, +# glusterfsd can fall behind and we start getting mount failures. Arguably, +# those are spurious because we will eventually catch up. We're just not +# ready *yet*. More to the point, even if the errors aren't spurious that's +# not what we're testing right now. Therefore, we give glusterfsd a bit more +# breathing room for this test than we would otherwise. +MOUNT_TIMEOUT=15 + +get_brick_base () { + printf "%s/vol%02d" $B0 $1 +} + +get_mount_point () { + printf "%s/vol%02d" $MOUNT_BASE $1 +} + +create_volume () { + + local vol_name=$(printf "%s-vol%02d" $V0 $1) + + local brick_base=$(get_brick_base $1) + local cmd="$CLI volume create $vol_name replica 2" + local b + for b in $(seq 0 5); do + local this_brick=${brick_base}/brick$b + mkdir -p $this_brick + cmd="$cmd $H0:$this_brick" + done + TEST $cmd + TEST $CLI volume start $vol_name + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $vol_name "Status" + local mount_point=$(get_mount_point $1) + mkdir -p $mount_point + TEST $GFS -s $H0 --volfile-id=$vol_name $mount_point +} + +cleanup_func () { + local v + for v in $(seq 1 $NUM_VOLS); do + local mount_point=$(get_mount_point $v) + force_umount $mount_point + rm -rf $mount_point + local vol_name=$(printf "%s-vol%02d" $V0 $v) + $CLI volume stop $vol_name + $CLI volume delete $vol_name + rm -rf $(get_brick_base $1) & + done &> /dev/null + wait +} +push_trapfunc cleanup_func + +TEST glusterd +TEST $CLI volume set all cluster.brick-multiplex on + +# Our infrastructure can't handle an arithmetic expression here. The formula +# is (NUM_VOLS-1)*5 because it sees each TEST/EXPECT once but needs the other +# NUM_VOLS-1 and there are 5 such statements in each iteration. +TESTS_EXPECTED_IN_LOOP=95 +for i in $(seq 1 $NUM_VOLS); do + create_volume $i + TEST dd if=/dev/zero of=$(get_mount_point $i)/a_file bs=4k count=1 +done + +# Kill glusterd, and wait a bit for all traces to disappear. +TEST killall -9 glusterd +sleep 5 +TEST killall -9 glusterfsd +sleep 5 + +# Restart glusterd. This is where the brick daemon supposedly dumps core, +# though I (jdarcy) have yet to see that. Again, give it a while to settle, +# just to be sure. +TEST glusterd + +cleanup_func +trap - EXIT +cleanup diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 55c2bda1dd1..3558ae91059 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -6044,6 +6044,15 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr, return ret; } +static void +glusterd_wait_for_blockers (glusterd_conf_t *priv) +{ + while (priv->blockers) { + synclock_unlock (&priv->big_lock); + sleep (1); + synclock_lock (&priv->big_lock); + } +} int32_t glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, @@ -6063,18 +6072,22 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, break; case GD_OP_STOP_VOLUME: + glusterd_wait_for_blockers (this->private); ret = glusterd_op_stop_volume (dict); break; case GD_OP_DELETE_VOLUME: + glusterd_wait_for_blockers (this->private); ret = glusterd_op_delete_volume (dict); break; case GD_OP_ADD_BRICK: + glusterd_wait_for_blockers (this->private); ret = glusterd_op_add_brick (dict, op_errstr); break; case GD_OP_REPLACE_BRICK: + glusterd_wait_for_blockers (this->private); ret = glusterd_op_replace_brick (dict, rsp_dict); break; @@ -6082,11 +6095,13 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_op_set_volume (dict, op_errstr); break; + case GD_OP_RESET_VOLUME: ret = glusterd_op_reset_volume (dict, op_errstr); break; case GD_OP_REMOVE_BRICK: + glusterd_wait_for_blockers (this->private); ret = glusterd_op_remove_brick (dict, op_errstr); break; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index f8525611ed4..a0df27c8bae 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -3134,8 +3134,8 @@ out: int glusterd_spawn_daemons (void *opaque) { - glusterd_conf_t *conf = THIS->private; - int ret = -1; + glusterd_conf_t *conf = THIS->private; + int ret = -1; synclock_lock (&conf->big_lock); glusterd_restart_bricks (conf); @@ -4904,9 +4904,13 @@ static int32_t my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame) { call_frame_t *frame = v_frame; + glusterd_conf_t *conf = frame->this->private; - STACK_DESTROY (frame->root); + synclock_lock (&conf->big_lock); + --(conf->blockers); + synclock_unlock (&conf->big_lock); + STACK_DESTROY (frame->root); return 0; } @@ -4923,6 +4927,7 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op) void *req = &brick_req; void *errlbl = &&err; struct rpc_clnt_connection *conn; + glusterd_conf_t *conf = this->private; extern struct rpc_clnt_program gd_brick_prog; if (!rpc) { @@ -4982,9 +4987,13 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op) iov.iov_len = ret; /* Send the msg */ + ++(conf->blockers); ret = rpc_clnt_submit (rpc, &gd_brick_prog, op, - my_callback, &iov, 1, NULL, 0, iobref, frame, - NULL, 0, NULL, 0, NULL); + my_callback, &iov, 1, NULL, 0, iobref, + frame, NULL, 0, NULL, 0, NULL); + if (ret) { + --(conf->blockers); + } return ret; free_iobref: @@ -5016,6 +5025,8 @@ attach_brick (xlator_t *this, char full_id[PATH_MAX] = {'\0',}; char path[PATH_MAX] = {'\0',}; int ret; + int tries; + rpc_clnt_t *rpc; gf_log (this->name, GF_LOG_INFO, "add brick %s to existing process for %s", @@ -5052,12 +5063,15 @@ attach_brick (xlator_t *this, } (void) build_volfile_path (full_id, path, sizeof(path), NULL); - int tries = 0; - while (tries++ <= 15) { - ret = send_attach_req (this, other_brick->rpc, path, - GLUSTERD_BRICK_ATTACH); - if (!ret) { - return 0; + for (tries = 15; tries > 0; --tries) { + rpc = rpc_clnt_ref (other_brick->rpc); + if (rpc) { + ret = send_attach_req (this, rpc, path, + GLUSTERD_BRICK_ATTACH); + rpc_clnt_unref (rpc); + if (!ret) { + return 0; + } } /* * It might not actually be safe to manipulate the lock like @@ -5423,6 +5437,8 @@ glusterd_restart_bricks (glusterd_conf_t *conf) conf = this->private; GF_VALIDATE_OR_GOTO (this->name, conf, out); + ++(conf->blockers); + ret = glusterd_get_quorum_cluster_counts (this, &active_count, &quorum_count); if (ret) @@ -5502,6 +5518,7 @@ glusterd_restart_bricks (glusterd_conf_t *conf) ret = 0; out: + --(conf->blockers); conf->restart_done = _gf_true; return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index d7983f6fd71..5d164fd8df3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -2761,14 +2761,11 @@ glusterd_op_delete_volume (dict_t *dict) { int ret = 0; char *volname = NULL; - glusterd_conf_t *priv = NULL; glusterd_volinfo_t *volinfo = NULL; xlator_t *this = NULL; this = THIS; GF_ASSERT (this); - priv = this->private; - GF_ASSERT (priv); ret = dict_get_str (dict, "volname", &volname); if (ret) { diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index 11d4b8bc79f..d26e959d3a3 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -1867,6 +1867,7 @@ init (xlator_t *this) if (ret < 0) goto out; + conf->blockers = 0; /* If the peer count is less than 2 then this would be the best time to * spawn process/bricks that may need (re)starting since last time * (this) glusterd was up. */ diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index e5b7b9f8f38..a773a2b6221 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -190,6 +190,7 @@ typedef struct { int ping_timeout; uint32_t generation; int32_t workers; + uint32_t blockers; } glusterd_conf_t; -- cgit