From 46d333783a968ab39e0beade9c7a1eec8035f8b1 Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Mon, 31 Mar 2014 18:37:38 +0000 Subject: nsr: add quorum enforcement Change-Id: I0241f8c1ac97c80ae438e3d9f1ac492d63da9347 Signed-off-by: Jeff Darcy --- tests/basic/quorum.t | 64 +++++++++++++++++++++++++ tests/basic/recon.t | 4 ++ xlators/cluster/nsr-server/src/all-templates.c | 25 ++++++++++ xlators/cluster/nsr-server/src/nsr-internal.h | 1 + xlators/cluster/nsr-server/src/nsr.c | 8 ++++ xlators/cluster/nsr-server/src/recon_notify.c | 10 +++- xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 +++ 7 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 tests/basic/quorum.t diff --git a/tests/basic/quorum.t b/tests/basic/quorum.t new file mode 100644 index 000000000..b8fc9cf3a --- /dev/null +++ b/tests/basic/quorum.t @@ -0,0 +1,64 @@ +#!/bin/bash + +# Test NSR quorum enforcement - start a two-way replica, kill one brick, and +# verify that writes fail while the volume is degraded. + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function get_rep_count { + v=$(getfattr --only-values -e text -n trusted.nsr.rep-count $1 2> /dev/null) + #echo $v > /dev/tty + echo $v +} + +function kill_a_brick { + for r in /var/lib/glusterd/vols/${V0}/run/*-recon.pid; do + rpid=$(cat $r) + #echo "recon PID = $rpid" > /dev/tty + b=$(echo $r | sed '/\(.*\):\(.*\)-recon.pid/s//\1\2.pid/') + bpid=$(cat $b) + #echo "brick PID = $bpid" > /dev/tty + kill -9 $bpid $rpid + return 0 + done + + # No bricks?!? 
+ return 1 +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2} + +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' +EXPECT '2' brick_count $V0 + +TEST $CLI volume set $V0 cluster.nsr on +TEST $CLI volume set $V0 cluster.nsr.recon on + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +## Mount FUSE with caching disabled +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Give the bricks a chance to connect to each other. +EXPECT_WITHIN 10 "2" get_rep_count $M0 + +TEST kill_a_brick +EXPECT_WITHIN 10 "1" get_rep_count $M0 + +# Make sure writes fail while degraded. +tmpfile=$(mktemp) +trap "rm $tmpfile" EXIT +dd if=/dev/urandom of=$M0/probe bs=4k count=100 status=none 2> $tmpfile +TEST [ x"$?" != x"0" ] +TEST grep -qs 'Read-only file system' $tmpfile + +cleanup diff --git a/tests/basic/recon.t b/tests/basic/recon.t index fac454530..e0fbea749 100755 --- a/tests/basic/recon.t +++ b/tests/basic/recon.t @@ -119,6 +119,10 @@ EXPECT '2' brick_count $V0 TEST $CLI volume set $V0 cluster.nsr on TEST $CLI volume set $V0 cluster.nsr.recon on +# This would normally be a terrible idea, but it's handy for issuing ops that +# will have to be reconciled later. 
+TEST $CLI volume set $V0 cluster.nsr.quorum-percent 0 + TEST $CLI volume start $V0 EXPECT 'Started' volinfo_field $V0 'Status' diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c index 2f0509a6c..fa29de7b2 100644 --- a/xlators/cluster/nsr-server/src/all-templates.c +++ b/xlators/cluster/nsr-server/src/all-templates.c @@ -59,6 +59,31 @@ nsr_$NAME$ (call_frame_t *frame, xlator_t *this, int from_leader; int from_recon; uint32_t ti = 0; + double must_be_up; + double are_up; + + /* + * Our first goal here is to avoid "split brain surprise" for users who + * specify exactly 50% with two- or three-way replication. That means + * either a more-than check against half the total replicas or an + * at-least check against half of our peers (one less). Of the two, + * only an at-least check supports the intuitive use of 100% to mean + * all replicas must be present, because "more than 100%" will never + * succeed regardless of which count we use. This leaves us with a + * slightly non-traditional definition of quorum ("at least X% of peers + * not including ourselves") but one that's useful enough to be worth + * it. + * + * Note that n_children and up_children *do* include the local + * subvolume, so we need to subtract one in each case. + */ + must_be_up = ((double)(priv->n_children - 1)) * priv->quorum_pct; + are_up = ((double)(priv->up_children - 1)) * 100.0; + if (are_up < must_be_up) { + /* Emulate the AFR client-side-quorum behavior. 
*/ + op_errno = EROFS; + goto err; + } local = mem_get0(this->local_pool); if (!local) { diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h index fc612c136..72b61bfa5 100644 --- a/xlators/cluster/nsr-server/src/nsr-internal.h +++ b/xlators/cluster/nsr-server/src/nsr-internal.h @@ -59,6 +59,7 @@ typedef struct { volatile uint32_t ops_in_flight; uint32_t index; gf_lock_t index_lock; + double quorum_pct; } nsr_private_t; typedef struct { diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c index eda9e555a..85eba09b5 100644 --- a/xlators/cluster/nsr-server/src/nsr.c +++ b/xlators/cluster/nsr-server/src/nsr.c @@ -591,6 +591,9 @@ nsr_init (xlator_t *this) goto err; } + + GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err); + GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err); gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid); if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) { @@ -800,5 +803,10 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_STR, .description = "UUID for this NSR (sub)volume" }, + { .key = {"quorum-percent"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "50.0", + .description = "percentage of rep_count-1 that must be up" + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c index 24f7cf2de..1c50de234 100644 --- a/xlators/cluster/nsr-server/src/recon_notify.c +++ b/xlators/cluster/nsr-server/src/recon_notify.c @@ -91,8 +91,14 @@ nsr_recon_set_leader (xlator_t *this) if (ctx->last_reconciled_term == priv->current_term) return; - // No majority as of yet - if (priv->up_children <= (priv->n_children / 2)) + /* + * Quorum for reconciliation is not the same as quorum for I/O. Here, + * we require a true majority. Both n_children and up_children include + * the local subvolume here, so the comparison needs no adjustment. 
+ * + * TBD: re-evaluate when to reconcile (including partial) + */ + if (priv->up_children <= (priv->n_children / 2)) return; gf_log (this->name, GF_LOG_INFO, diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 39bbe0a13..24a6ed7cd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -889,6 +889,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .description = "enable NSR reconciliation", .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT }, + { .key = "cluster.nsr.quorum-percent", + .voltype = "cluster/nsr", + .option = "quorum-percent", + .op_version = 3, + .description = "percent of rep_count-1 bricks that must be up" + }, /* Performance xlators enable/disbable options */ { .key = "performance.write-behind", -- cgit