summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ravishankar N <ravishankar@redhat.com> 2015-04-15 22:22:08 +0530
committer Vijay Bellur <vbellur@redhat.com> 2015-05-05 00:06:34 -0700
commit 25e8e74eb7b81ccd114a9833371a3f72d284c48d (patch)
tree 0c82b2ad9133c55c74852ce65d027bc36ed6cac5
parent 6d7428d2018c061ca2791443bd90980f9755ded3 (diff)
afr: add arbitration support
Add logic in afr to work in conjunction with the arbiter xlator when a replica 3 arbiter volume is created. More specifically, this patch:
* Enables full locks for afr data transactions for such volumes.
* Removes the upfront marking of pending xattrs at the time of pre-op and defers it to post-op. (This is an arbiter-independent change and is made for all afr transactions.)
* After the pre-op stage, checks whether we can proceed with the fop stage without ending up in split-brain by examining the changelog xattrs.
* Unwinds the fop with failure if only one source was available at the time of pre-op and the fop happened to fail on that particular source brick.
* Skips data self-heal if the arbiter brick is the only source available.
* Adds the arbiter-count option to the shd graph.

This patch is a part of the arbiter logic implementation for 3-way AFR, details of which can be found at http://review.gluster.org/#/c/9656/

Change-Id: I9603db9d04de5626eb2f4d8d959ef5b46113561d
BUG: 1199985
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: http://review.gluster.org/10258
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rw-r--r-- tests/basic/afr/arbiter.t 64
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 25
-rw-r--r-- xlators/cluster/afr/src/afr-dir-write.c 2
-rw-r--r-- xlators/cluster/afr/src/afr-inode-write.c 12
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 6
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h 3
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.c 183
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.h 2
-rw-r--r-- xlators/cluster/afr/src/afr.h 6
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volgen.c 16
10 files changed, 272 insertions, 47 deletions
diff --git a/tests/basic/afr/arbiter.t b/tests/basic/afr/arbiter.t
new file mode 100644
index 00000000000..a9d485cd7b4
--- /dev/null
+++ b/tests/basic/afr/arbiter.t
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+cleanup;
+
+TEST glusterd;
+TEST pidof glusterd
+
+# Non arbiter replica 3 volumes should not have arbiter-count option enabled.
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
+TEST ! stat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count
+TEST umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+# Create and mount a replica 3 arbiter volume.
+TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
+TEST stat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count
+EXPECT "1" cat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count
+
+# Write data and metadata
+TEST `echo hello >> $M0/file`
+TEST setfattr -n user.name -v value1 $M0/file
+
+# Data I/O will fail if arbiter is the only source.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST `echo "B0 is down, B1 and B2 are sources" >> $M0/file`
+TEST setfattr -n user.name -v value2 $M0/file
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST `echo "B2 is down, B3 is the only source, writes will fail" >> $M0/file`
+TEST ! cat $M0/file
+# Metadata I/O should still succeed.
+TEST getfattr -n user.name $M0/file
+TEST setfattr -n user.name -v value3 $M0/file
+
+# shd should not perform data self-heal from the arbiter to the sinks.
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"1")
+EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"2")
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST $CLI volume heal $V0
+EXPECT 0 afr_get_pending_heal_count $V0
+
+# I/O can resume again.
+TEST cat $M0/file
+TEST getfattr -n user.name $M0/file
+TEST `echo append>> $M0/file`
+TEST umount $M0
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 8993b164b91..8fbca0b6f42 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -993,6 +993,17 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
afr_entry_lockee_cleanup (&local->internal_lock);
GF_FREE (local->transaction.pre_op);
+
+ GF_FREE (local->transaction.pre_op_sources);
+ if (local->transaction.pre_op_xdata) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op_xdata[i])
+ continue;
+ dict_unref (local->transaction.pre_op_xdata[i]);
+ }
+ GF_FREE (local->transaction.pre_op_xdata);
+ }
+
GF_FREE (local->transaction.eager_lock);
GF_FREE (local->transaction.fop_subvols);
GF_FREE (local->transaction.failed_subvols);
@@ -4055,6 +4066,20 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (!local->transaction.pre_op)
goto out;
+ if (priv->arbiter_count == 1) {
+ local->transaction.pre_op_xdata =
+ GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
+ priv->child_count, gf_afr_mt_dict_t);
+ if (!local->transaction.pre_op_xdata)
+ goto out;
+
+ local->transaction.pre_op_sources =
+ GF_CALLOC (sizeof (*local->transaction.pre_op_sources),
+ priv->child_count, gf_afr_mt_char);
+ if (!local->transaction.pre_op_sources)
+ goto out;
+ }
+
local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols),
priv->child_count,
gf_afr_mt_char);
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index f7ca9108092..8a2c0e46e40 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -168,6 +168,8 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
local->replies[i].postparent2;
}
}
+
+ afr_txn_arbitrate_fop_cbk (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index f712166e44d..f9fde44e9e4 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -37,7 +37,7 @@
#include "protocol-common.h"
#include "afr-transaction.h"
-
+#include "afr-self-heal.h"
static void
__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
@@ -97,6 +97,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
}
}
}
+
+ afr_txn_arbitrate_fop_cbk (frame, this);
}
@@ -342,6 +344,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
{
call_frame_t *transaction_frame = NULL;
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
int ret = -1;
int op_errno = ENOMEM;
@@ -350,6 +353,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
goto out;
local = frame->local;
+ priv = this->private;
transaction_frame->local = local;
frame->local = NULL;
@@ -379,6 +383,12 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
local->transaction.start = local->cont.writev.offset;
local->transaction.len = iov_length (local->cont.writev.vector,
local->cont.writev.count);
+
+ /*Lock entire file to avoid network split brains.*/
+ if (priv->arbiter_count == 1) {
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ }
}
ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index a8a7326e4ec..7567fe9f851 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -684,6 +684,12 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
source = ret;
+ if (priv->arbiter_count == 1 && source == ARBITER_BRICK_INDEX &&
+ AFR_COUNT (sources, priv->child_count) == 1) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
locked_replies,
locked_replies[source].poststat.ia_size);
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 32be2480234..956f075e25b 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -141,6 +141,9 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
afr_transaction_type type,
unsigned char *locked_on, unsigned char *sources,
unsigned char *sinks, uint64_t *witness);
+int
+afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata);
int
afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index d2111060035..a2023884465 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -15,7 +15,7 @@
#include "afr.h"
#include "afr-transaction.h"
-
+#include "afr-self-heal.h"
#include <signal.h>
gf_boolean_t
@@ -139,14 +139,130 @@ __mark_all_success (call_frame_t *frame, xlator_t *this)
}
}
+void
+afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_transaction_type type = -1;
+ dict_t *xdata = NULL;
+ int **matrix = NULL;
+ int idx = -1;
+ int i = 0;
+ int j = 0;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+ idx = afr_index_for_transaction_type (type);
+ matrix = ALLOC_MATRIX (priv->child_count, int);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op_xdata[i])
+ continue;
+ xdata = local->transaction.pre_op_xdata[i];
+ afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
+ }
+
+ memset (local->transaction.pre_op_sources, 1, priv->child_count);
+
+ /*If lock or pre-op failed on a brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->transaction.pre_op_sources[i] = 0;
+ }
+
+ /* If brick is blamed by others, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j] != 0)
+ local->transaction.pre_op_sources[j] = 0;
+
+ /*We don't need the xattrs any more. */
+ for (i = 0; i < priv->child_count; i++)
+ if (local->transaction.pre_op_xdata[i]) {
+ dict_unref (local->transaction.pre_op_xdata[i]);
+ local->transaction.pre_op_xdata[i] = NULL;
+ }
+}
+
+void
+afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t fop_failed = _gf_false;
+ unsigned char *pre_op_sources = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ pre_op_sources = local->transaction.pre_op_sources;
+
+ if (priv->arbiter_count != 1 || local->op_ret < 0)
+ return;
+
+ /* If the fop failed on the brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ if (local->transaction.failed_subvols[i])
+ pre_op_sources[i] = 0;
+
+ switch (AFR_COUNT (pre_op_sources, priv->child_count)) {
+ case 1:
+ if (pre_op_sources[ARBITER_BRICK_INDEX])
+ fop_failed = _gf_true;
+ break;
+ case 0:
+ fop_failed = _gf_true;
+ break;
+ }
+
+ if (fop_failed) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ }
+
+ return;
+}
+
+void
+afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_sources_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_compute_pre_op_sources (frame, this);
+ pre_op_sources_count = AFR_COUNT (local->transaction.pre_op_sources,
+ priv->child_count);
+
+ /* If arbiter is the only source, do not proceed. */
+ if (pre_op_sources_count < 2 &&
+ local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+ local->internal_lock.lock_cbk = local->transaction.done;
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ afr_restore_lk_owner (frame);
+ afr_unlock (frame, this);
+ } else {
+ local->transaction.fop (frame, this);
+ }
+
+ return;
+}
int
afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
fd_t *fd = NULL;
local = frame->local;
+ priv = this->private;
fd = local->fd;
/* Perform fops with the lk-owner from top xlator.
@@ -172,12 +288,15 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
*/
if (fd)
afr_delayed_changelog_wake_up (this, fd);
- local->transaction.fop (frame, this);
+ if (priv->arbiter_count == 1) {
+ afr_txn_arbitrate_fop (frame, this);
+ } else {
+ local->transaction.fop (frame, this);
+ }
return 0;
}
-
static int
__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
{
@@ -372,11 +491,16 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ int pre_op_count = 0;
int i = 0;
local = frame->local;
priv = this->private;
+ pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ if (pre_op_count < priv->child_count)
+ return _gf_false;
+
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.failed_subvols[i])
return _gf_false;
@@ -591,9 +715,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
else
need_undirty = _gf_true;
- //If the fop fails on all the subvols then pending markers are placed
- //for every subvol on all subvolumes. Which is nothing but split-brain.
- //Avoid this by not doing post-op in case of failures.
if (local->op_ret < 0) {
afr_changelog_post_op_done (frame, this);
goto out;
@@ -846,12 +967,22 @@ afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
int call_count = -1;
+ int child_index = -1;
local = frame->local;
+ priv = this->private;
+ child_index = (long) cookie;
if (op_ret == -1)
- afr_transaction_fop_failed (frame, this, (long) cookie);
+ afr_transaction_fop_failed (frame, this, child_index);
+
+ if (priv->arbiter_count == 1 && !op_ret) {
+ if (xattr)
+ local->transaction.pre_op_xdata[child_index] =
+ dict_ref (xattr);
+ }
call_count = afr_frame_return (frame);
@@ -964,7 +1095,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
unsigned char *locked_nodes = NULL;
- unsigned char *pending_subvols = NULL;
int idx = -1;
gf_boolean_t pre_nop = _gf_true;
dict_t *xdata_req = NULL;
@@ -975,15 +1105,13 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- pending_subvols = alloca0 (priv->child_count);
-
for (i = 0; i < priv->child_count; i++) {
if (locked_nodes[i]) {
local->transaction.pre_op[i] = 1;
call_count++;
} else {
- pending_subvols[i] = 1;
- }
+ local->transaction.failed_subvols[i] = 1;
+ }
}
/* This condition should not be met with present code, as
@@ -1009,28 +1137,21 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
goto err;
}
- pre_nop = _gf_true;
-
if (afr_changelog_pre_op_inherit (frame, this))
goto next;
- if (call_count < priv->child_count) {
- /* For subvols we are not performing operation on,
- mark them as pending up-front along with the FOP
- so that we can safely defer unmarking dirty until
- later.
- */
- for (i = 0; i < priv->child_count; i++) {
- if (pending_subvols[i])
- local->pending[i][idx] = hton32(1);
- }
- ret = afr_set_pending_dict (priv, xdata_req,
- local->pending);
- if (ret < 0) {
- op_errno = ENOMEM;
- goto err;
- }
- pre_nop = _gf_false;
+ if (call_count < priv->child_count)
+ pre_nop = _gf_false;
+
+ /* Set an all-zero pending changelog so that in the cbk, we can get the
+ * current on-disk values. In a replica 3 volume with arbiter enabled,
+ * these values are needed to arrive at a go/no-go decision for the fop
+ * phase, to avoid ending up in split-brain. */
+
+ ret = afr_set_pending_dict (priv, xdata_req, local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
}
if (call_count > 1 &&
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index c3ce333b771..47d43d88991 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -16,6 +16,8 @@
void
afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int child_index);
+void
+afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this);
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f7bc6ea0f94..6cb708ffbd7 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -39,6 +39,8 @@
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define ARBITER_BRICK_INDEX 2
+
typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
@@ -656,6 +658,10 @@ typedef struct _afr_local {
unsigned char *pre_op;
+ /* For arbiter configuration only. */
+ dict_t **pre_op_xdata;
+ unsigned char *pre_op_sources;
+
/* @fop_subvols: subvolumes on which FOP will be attempted */
unsigned char *fop_subvols;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index fe281dd33db..a56d6d5ccca 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -4060,19 +4060,6 @@ out:
}
static int
-volgen_graph_build_replicate_clusters (volgen_graph_t *graph,
- glusterd_volinfo_t *volinfo)
-{
- char *replicate_args[] = {"cluster/replicate",
- "%s-replicate-%d"};
-
- return volgen_link_bricks_from_list_tail (graph, volinfo, "cluster/replicate",
- "%s-replicate-%d",
- volinfo->brick_count,
- volinfo->replica_count);
-}
-
-static int
build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict)
{
@@ -4086,8 +4073,7 @@ build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
switch (volinfo->type) {
case GF_CLUSTER_TYPE_REPLICATE:
case GF_CLUSTER_TYPE_STRIPE_REPLICATE:
- clusters = volgen_graph_build_replicate_clusters (graph,
- volinfo);
+ clusters = volgen_graph_build_afr_clusters (graph, volinfo);
break;
case GF_CLUSTER_TYPE_DISPERSE: