summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2017-08-16 18:01:17 +0530
committerShyamsundar Ranganathan <srangana@redhat.com>2017-11-21 18:00:33 +0000
commitfa13ff2f94e16532014f5a6744f494478870e47a (patch)
tree5c2dfe3d86cafa7d24e389d17a818938fc9fed2e
parentf9bfda413437bdf32b48f5c2cef1887b1949bb84 (diff)
afr: add checks for allowing lookups
Problem: In an arbiter volume, lookup was being served from one of the sink bricks (source brick was down). shard uses the iatt values from lookup cbk to calculate the size and block count, which in this case were incorrect values. shard_local_t->last_block was thus initialised to -1, resulting in an infinite while loop in shard_common_resolve_shards(). Fix: Use client quorum logic to allow or fail the lookups from afr if there are no readable subvolumes. So in replica-3 or arbiter vols, if there is no good copy or if quorum is not met, fail lookup with ENOTCONN. With this fix, we are also removing support for quorum-reads xlator option. So if quorum is not met, neither read nor write txns are allowed and we fail the fop with ENOTCONN. Change-Id: Ic65c00c24f77ece007328b421494eee62a505fa0 BUG: 1515572 Signed-off-by: Ravishankar N <ravishankar@redhat.com> (cherry picked from commit bd44d59741bb8c0f5d7a62c5b1094179dd0ce8a4)
-rw-r--r--tests/basic/afr/quorum.t23
-rwxr-xr-xtests/bugs/replicate/bug-977797.t2
-rw-r--r--xlators/cluster/afr/src/afr-common.c244
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c3
-rw-r--r--xlators/cluster/afr/src/afr.c7
-rw-r--r--xlators/cluster/afr/src/afr.h1
6 files changed, 164 insertions, 116 deletions
diff --git a/tests/basic/afr/quorum.t b/tests/basic/afr/quorum.t
index 252e25468d7..58116ba49f5 100644
--- a/tests/basic/afr/quorum.t
+++ b/tests/basic/afr/quorum.t
@@ -31,11 +31,7 @@ TEST $CLI volume set $V0 cluster.quorum-count 2
TEST test_write
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST ! test_write
-EXPECT "abc" cat $M0/b
-TEST $CLI volume set $V0 cluster.quorum-reads on
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
TEST ! cat $M0/b
-TEST $CLI volume reset $V0 cluster.quorum-reads
TEST $CLI volume set $V0 cluster.quorum-type auto
EXPECT auto volume_option $V0 cluster.quorum-type
@@ -44,11 +40,7 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
TEST test_write
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST ! test_write
-EXPECT "abc" cat $M0/b
-TEST $CLI volume set $V0 cluster.quorum-reads on
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
TEST ! cat $M0/b
-TEST $CLI volume reset $V0 cluster.quorum-reads
TEST $CLI volume set $V0 cluster.quorum-type none
EXPECT none volume_option $V0 cluster.quorum-type
@@ -57,11 +49,6 @@ TEST test_write
TEST $CLI volume reset $V0 cluster.quorum-type
TEST test_write
EXPECT "abc" cat $M0/b
-TEST $CLI volume set $V0 cluster.quorum-reads on
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
-EXPECT "abc" cat $M0/b
-TEST $CLI volume reset $V0 cluster.quorum-reads
-
cleanup;
TEST glusterd;
@@ -86,24 +73,14 @@ TEST $CLI volume set $V0 cluster.quorum-count 3
TEST test_write
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST ! test_write
-EXPECT "abc" cat $M0/b
-TEST $CLI volume set $V0 cluster.quorum-reads on
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
TEST ! cat $M0/b
-TEST $CLI volume reset $V0 cluster.quorum-reads
-
TEST $CLI volume set $V0 cluster.quorum-type auto
EXPECT auto volume_option $V0 cluster.quorum-type
TEST test_write
TEST kill_brick $V0 $H0 $B0/${V0}3
TEST ! test_write
-EXPECT "abc" cat $M0/b
-TEST $CLI volume set $V0 cluster.quorum-reads on
-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads
TEST ! cat $M0/b
-TEST $CLI volume reset $V0 cluster.quorum-reads
-
TEST $CLI volume set $V0 cluster.quorum-type none
EXPECT none volume_option $V0 cluster.quorum-type
diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t
index ea9a98adc23..fee82054cc3 100755
--- a/tests/bugs/replicate/bug-977797.t
+++ b/tests/bugs/replicate/bug-977797.t
@@ -53,6 +53,8 @@ TEST chmod 757 $M0/a/file
TEST $CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1;
+#Trigger entry heal of $M0/a
+getfattr -n user.nosuchattr $M0/a
dd if=$M0/a/file of=/dev/null bs=1024k
#read fails, but heal is triggered.
TEST [ $? -ne 0 ]
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 5e336c2a74c..e0468a4006c 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -48,9 +48,7 @@
int32_t
afr_quorum_errno (afr_private_t *priv)
{
- if (priv->quorum_reads)
- return ENOTCONN;
- return EROFS;
+ return ENOTCONN;
}
int
@@ -1092,8 +1090,6 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
return ret;
}
-
-
int
afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
{
@@ -1664,6 +1660,29 @@ afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
return ret;
}
+void
+afr_readables_intersect_get (inode_t *inode, xlator_t *this, int *event,
+ unsigned char *intersection)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *intersect = NULL;
+
+ priv = this->private;
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+ intersect = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_get (inode, this, data_readable,
+ metadata_readable, event);
+
+ AFR_INTERSECT (intersect, data_readable, metadata_readable,
+ priv->child_count);
+ if (intersection)
+ memcpy (intersection, intersect,
+ sizeof (*intersection) * priv->child_count);
+}
int
afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
@@ -1672,8 +1691,6 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
afr_read_subvol_args_t *args)
{
afr_private_t *priv = NULL;
- unsigned char *data_readable = NULL;
- unsigned char *metadata_readable = NULL;
unsigned char *readable = NULL;
unsigned char *intersection = NULL;
int subvol = -1;
@@ -1682,17 +1699,11 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
priv = this->private;
readable = alloca0 (priv->child_count);
- data_readable = alloca0 (priv->child_count);
- metadata_readable = alloca0 (priv->child_count);
intersection = alloca0 (priv->child_count);
afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
- afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
- &event);
-
- AFR_INTERSECT (intersection, data_readable, metadata_readable,
- priv->child_count);
+ afr_readables_intersect_get (inode, this, &event, intersection);
if (AFR_COUNT (intersection, priv->child_count) > 0)
subvol = afr_read_subvol_select_by_policy (inode, this,
@@ -2145,18 +2156,28 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
int
afr_read_subvol_decide (inode_t *inode, xlator_t *this,
- afr_read_subvol_args_t *args)
+ afr_read_subvol_args_t *args, unsigned char *readable)
{
- int data_subvol = -1;
- int mdata_subvol = -1;
+ int event = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *intersection = NULL;
- data_subvol = afr_data_subvol_get (inode, this, NULL, NULL, NULL, args);
- mdata_subvol = afr_metadata_subvol_get (inode, this,
- NULL, NULL, NULL, args);
- if (data_subvol == -1 || mdata_subvol == -1)
+ priv = this->private;
+ intersection = alloca0 (priv->child_count);
+
+ afr_readables_intersect_get (inode, this, &event, intersection);
+
+ if (AFR_COUNT (intersection, priv->child_count) <= 0) {
+ /* TODO: If we have one brick with valid data_readable and
+ * another with metadata_readable, try to send an iatt with
+ * valid bits from both.*/
return -1;
+ }
- return data_subvol;
+ memcpy (readable, intersection, sizeof (*readable) * priv->child_count);
+
+ return afr_read_subvol_select_by_policy (inode, this, intersection,
+ args);
}
static inline int
@@ -2173,7 +2194,49 @@ afr_first_up_child (call_frame_t *frame, xlator_t *this)
if (local->replies[i].valid &&
local->replies[i].op_ret == 0)
return i;
- return 0;
+ return -1;
+}
+
+static void
+afr_attempt_readsubvol_set (call_frame_t *frame, xlator_t *this,
+ unsigned char *success_replies,
+ unsigned char *data_readable, int *read_subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_choice = -1;
+ int child_count = -1;
+
+ if (*read_subvol != -1)
+ return;
+
+ priv = this->private;
+ local = frame->local;
+ child_count = priv->child_count;
+
+ afr_inode_split_brain_choice_get (local->inode, this,
+ &spb_choice);
+ if ((spb_choice >= 0) &&
+ (AFR_COUNT(success_replies, child_count) == child_count)) {
+ *read_subvol = spb_choice;
+ } else if (!priv->quorum_count) {
+ *read_subvol = afr_first_up_child (frame, this);
+ } else if (priv->quorum_count &&
+ afr_has_quorum (data_readable, this)) {
+ /* read_subvol is guaranteed to be valid if we hit this path. */
+ *read_subvol = afr_first_up_child (frame, this);
+ } else {
+ /* If quorum is enabled and we do not have a
+ readable yet, it means all good copies are down.
+ */
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_READ_SUBVOL_ERROR, "no read "
+ "subvols for %s", local->loc.path);
+ }
+ if (*read_subvol >= 0)
+ dict_del (local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
}
static void
@@ -2187,13 +2250,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
int par_read_subvol = 0;
int ret = -1;
unsigned char *readable = NULL;
+ unsigned char *success_replies = NULL;
int event = 0;
struct afr_reply *replies = NULL;
uuid_t read_gfid = {0, };
gf_boolean_t locked_entry = _gf_false;
gf_boolean_t can_interpret = _gf_true;
inode_t *parent = NULL;
- int spb_choice = -1;
ia_type_t ia_type = IA_INVAL;
afr_read_subvol_args_t args = {0,};
char *gfid_heal_msg = NULL;
@@ -2207,11 +2270,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
this);
readable = alloca0 (priv->child_count);
+ success_replies = alloca0 (priv->child_count);
afr_inode_read_subvol_get (parent, this, readable, NULL, &event);
+ par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
+ readable);
- afr_inode_split_brain_choice_get (local->inode, this,
- &spb_choice);
/* First, check if we have a gfid-change from somewhere,
If so, propagate that so that a fresh lookup can be
issued
@@ -2219,13 +2283,17 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
if (local->cont.lookup.needs_fresh_lookup) {
local->op_ret = -1;
local->op_errno = ESTALE;
- goto unwind;
+ goto error;
}
op_errno = afr_final_errno (frame->local, this->private);
local->op_errno = op_errno;
read_subvol = -1;
+ for (i = 0; i < priv->child_count; i++)
+ if (replies[i].valid && replies[i].op_ret == 0)
+ success_replies[i] = 1;
+
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
@@ -2234,9 +2302,9 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
replies[i].op_errno == ENOENT) {
/* Second, check entry is still
"underway" in creation */
- local->op_ret = -1;
- local->op_errno = ENOENT;
- goto unwind;
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
}
if (replies[i].op_ret == -1)
@@ -2250,8 +2318,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
}
}
- if (read_subvol == -1)
- goto unwind;
+ if (read_subvol == -1)
+ goto error;
/* We now have a read_subvol, which is readable[] (if there
were any). Next we look for GFID mismatches. We don't
consider a GFID mismatch as an error if read_subvol is
@@ -2277,58 +2345,61 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
if (readable[read_subvol] && !readable[i])
continue;
+ /* If we were called from glfsheal and there is still a gfid
+ * mismatch, succeed the lookup and let glfsheal print the
+ * response via gfid-heal-msg.*/
+ if (!dict_get_str (local->xattr_req, "gfid-heal-msg",
+ &gfid_heal_msg))
+ goto cant_interpret;
+
/* LOG ERROR */
local->op_ret = -1;
local->op_errno = EIO;
- goto unwind;
+ goto error;
}
/* Forth, for the finalized GFID, pick the best subvolume
to return stats from.
*/
+ read_subvol = -1;
+ memset (readable, 0, sizeof (*readable) * priv->child_count);
if (can_interpret) {
+ if (!afr_has_quorum (success_replies, this))
+ goto cant_interpret;
/* It is safe to call afr_replies_interpret() because we have
a response from all the UP subvolumes and all of them resolved
to the same GFID
*/
gf_uuid_copy (args.gfid, read_gfid);
args.ia_type = ia_type;
- if (afr_replies_interpret (frame, this, local->inode, NULL)) {
- read_subvol = afr_read_subvol_decide (local->inode,
- this, &args);
+ ret = afr_replies_interpret (frame, this, local->inode, NULL);
+ read_subvol = afr_read_subvol_decide (local->inode, this, &args,
+ readable);
+ if (read_subvol == -1)
+ goto cant_interpret;
+ if (ret) {
afr_inode_event_gen_reset (local->inode, this);
- goto cant_interpret;
- } else {
- read_subvol = afr_data_subvol_get (local->inode, this,
- NULL, NULL, NULL, &args);
- }
+ dict_del (local->replies[read_subvol].xdata,
+ GF_CONTENT_KEY);
+ }
} else {
cant_interpret:
+ afr_attempt_readsubvol_set (frame, this, success_replies,
+ readable, &read_subvol);
if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
+ goto error;
}
- dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
}
afr_handle_quota_size (frame, this);
-unwind:
afr_set_need_heal (this, local);
- if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
-
- }
- par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
- readable);
if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0, "Arbiter cannot be a read subvol "
+ "for %s", local->loc.path);
+ goto error;
}
ret = dict_get_str (local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
@@ -2348,6 +2419,11 @@ unwind:
local->inode, &local->replies[read_subvol].poststat,
local->replies[read_subvol].xdata,
&local->replies[par_read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, NULL,
+ NULL, NULL, NULL);
}
/*
@@ -2765,55 +2841,54 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
int i = -1;
int op_errno = 0;
- int spb_choice = -1;
int read_subvol = -1;
+ unsigned char *data_readable = NULL;
+ unsigned char *success_replies = NULL;
priv = this->private;
local = frame->local;
-
- afr_inode_split_brain_choice_get (local->inode, this,
- &spb_choice);
+ data_readable = alloca0 (priv->child_count);
+ success_replies = alloca0 (priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (!local->replies[i].valid)
continue;
- if (local->replies[i].op_ret == 0)
+ if (local->replies[i].op_ret == 0) {
+ success_replies[i] = 1;
local->op_ret = 0;
+ }
}
op_errno = afr_final_errno (frame->local, this->private);
if (local->op_ret < 0) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- goto unwind;
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL,
+ NULL, NULL);
+ return;
}
- afr_replies_interpret (frame, this, local->inode, NULL);
+ if (!afr_has_quorum (success_replies, this))
+ goto unwind;
- read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
- if (read_subvol == -1) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",
- local->loc.path);
+ afr_replies_interpret (frame, this, local->inode, NULL);
- if (spb_choice >= 0) {
- read_subvol = spb_choice;
- } else {
- read_subvol = afr_first_up_child (frame, this);
- }
- }
+ read_subvol = afr_read_subvol_decide (local->inode, this, NULL,
+ data_readable);
unwind:
+ afr_attempt_readsubvol_set (frame, this, success_replies, data_readable,
+ &read_subvol);
if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ NULL, NULL, NULL, NULL);
+ return;
}
+
if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug (this->name, 0, "Arbiter cannot be a read subvol "
+ "for %s", local->loc.path);
}
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
@@ -4673,7 +4748,6 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write("read_child", "%d", priv->read_child);
gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
gf_proc_dump_write("wait_count", "%u", priv->wait_count);
- gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);
gf_proc_dump_write("heal-wait-queue-length", "%d",
priv->heal_wait_qlen);
gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 50e8040d33e..f6c491b713e 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -193,8 +193,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->inode = inode_ref (inode);
local->is_read_txn = _gf_true;
- if (priv->quorum_reads &&
- priv->quorum_count && !afr_has_quorum (priv->child_up, this)) {
+ if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
local->op_ret = -1;
local->op_errno = ENOTCONN;
read_subvol = -1;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 84dbcc04680..b18f60fcc7c 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -278,8 +278,6 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,
int32, out);
- GF_OPTION_RECONF ("quorum-reads", priv->quorum_reads, options,
- bool, out);
GF_OPTION_RECONF ("consistent-metadata", priv->consistent_metadata,
options, bool, out);
@@ -554,7 +552,6 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
- GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out);
GF_OPTION_INIT ("consistent-metadata", priv->consistent_metadata, bool,
out);
GF_OPTION_INIT ("consistent-io", priv->consistent_io, bool, out);
@@ -998,8 +995,8 @@ struct volume_options options[] = {
{ .key = {"quorum-reads"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "no",
- .description = "If quorum-reads is \"true\" only allow reads if "
- "quorum is met when quorum is enabled.",
+ .description = "This option has been removed. Reads are not allowed "
+ "if quorum is not met.",
},
{ .key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 8cd687398f0..25625665c42 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -139,7 +139,6 @@ typedef struct _afr_private {
gf_boolean_t pre_op_compat; /* on/off */
uint32_t post_op_delay_secs;
unsigned int quorum_count;
- gf_boolean_t quorum_reads;
char vol_uuid[UUID_SIZE + 1];
int32_t *last_event;