author    Ravishankar N <ravishankar@redhat.com>    2018-08-31 10:32:20 +0530
committer Ravishankar N <ravishankar@redhat.com>    2018-09-05 08:28:23 +0000
commit    69532c141be160b3fea03c1579ae4ac13018dcdf (patch)
tree      87c1db97307ce692343144d147eaa7cf3b0b5dfc
parent    560bd8671f2f9ae17530ad1d23a209dc95d41f82 (diff)
afr: thin-arbiter read txn changes
If both data bricks are up, read subvol will be based on read_subvols.

If only one data brick is up:
- First query the data brick that is up. If it blames the other brick,
  allow the reads.
- If it doesn't, query the TA to obtain the source of truth.

TODO: See if in-memory state can be maintained for read txns (BZ 1624358).

updates: bz#1579788
Change-Id: I61eec35592af3a1aaf9f90846d9a358b2e4b2fcc
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
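In code terms, the decision taken when one data brick is down can be
summarised by the small toy model below. This is a hedged sketch, not AFR
code: the up/pending tables and pick_read_subvol() are illustrative
stand-ins for the child-up map and for the xattrop replies that
afr_ta_read_txn() in the diff inspects via
afr_ta_dict_contains_pending_xattr().

/* Compilable toy model of the thin-arbiter read decision; all names
 * here are illustrative, only the flow mirrors the patch below. */
#include <stdio.h>
#include <stdbool.h>

enum { B0, B1, TA, NBRICKS, NO_READ = -1 };

static bool up[NBRICKS] = { true, false, true };
/* pending[who][whom]: does `who` hold a non-zero pending xattr blaming
 * `whom`? (afr_ta_dict_contains_pending_xattr() in the patch) */
static bool pending[NBRICKS][NBRICKS] = {
        { false, true,  false },  /* B0 blames B1 */
        { false, false, false },  /* B1 blames no one */
        { false, false, false },  /* TA blames no one */
};

static int
pick_read_subvol (void)
{
        if (up[B0] && up[B1])
                return B0;  /* both up: existing policy-based selection */

        int good = up[B0] ? B0 : B1;  /* the one data brick that is up */
        int bad  = (good == B0) ? B1 : B0;

        if (pending[good][bad])  /* up brick blames the down one */
                return good;
        if (!pending[TA][good])  /* TA, queried under the AFR_TA_DOM_MODIFY
                                    inodelk, does not blame the up brick */
                return good;
        return NO_READ;  /* the good copy is down: fail the read with EIO */
}

int
main (void)
{
        printf ("read subvol: %d\n", pick_read_subvol ());  /* prints 0 */
        return 0;
}

With the tables as initialised here, B0 is up and blames B1, so the model
reads from B0 without consulting the TA, matching the first bullet above.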
-rw-r--r--  tests/basic/afr/ta-read.t               |  60
-rw-r--r--  tests/thin-arbiter.rc                   |   5
-rw-r--r--  xlators/cluster/afr/src/afr-common.c    |  22
-rw-r--r--  xlators/cluster/afr/src/afr-read-txn.c  | 249
-rw-r--r--  xlators/cluster/afr/src/afr.h           |   3
5 files changed, 320 insertions(+), 19 deletions(-)
diff --git a/tests/basic/afr/ta-read.t b/tests/basic/afr/ta-read.t
new file mode 100644
index 00000000000..f2b3c38e06c
--- /dev/null
+++ b/tests/basic/afr/ta-read.t
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Test read transaction logic for thin-arbiter.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../thin-arbiter.rc
+cleanup;
+TEST ta_create_brick_and_volfile brick0
+TEST ta_create_brick_and_volfile brick1
+TEST ta_create_ta_and_volfile ta
+TEST ta_start_brick_process brick0
+TEST ta_start_brick_process brick1
+TEST ta_start_ta_process ta
+
+TEST ta_create_mount_volfile brick0 brick1 ta
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta
+
+TEST touch $M0/FILE
+TEST ls $B0/brick0/FILE
+TEST ls $B0/brick1/FILE
+TEST ! ls $B0/ta/FILE
+
+# Kill one brick and write to FILE.
+TEST ta_kill_brick brick0
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
+echo "brick0 down">> $M0/FILE
+TEST [ $? -eq 0 ]
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/FILE
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.patchy-ta-2
+
+#Umount and mount to remove cached data.
+TEST umount $M0
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
+# Read must be allowed since good brick is up.
+TEST cat $M0/FILE
+
+# Toggle good and bad data brick processes.
+TEST ta_start_brick_process brick0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
+TEST ta_kill_brick brick1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
+# Read must now fail.
+TEST ! cat $M0/FILE
+
+# Bring all data bricks up, and kill TA.
+TEST ta_start_brick_process brick1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
+TA_PID=$(ta_get_pid_by_brick_name ta)
+TEST [ -n "$TA_PID" ]
+TEST ta_kill_brick ta
+TA_PID=$(ta_get_pid_by_brick_name ta)
+TEST [ -z "$TA_PID" ]
+# Read must now succeed.
+TEST cat $M0/FILE
+cleanup;
diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc
index 8bb888a7471..36d11cea61d 100644
--- a/tests/thin-arbiter.rc
+++ b/tests/thin-arbiter.rc
@@ -419,6 +419,11 @@ function ta_kill_brick()
kill -9 $p
}
+function ta_get_pid_by_brick_name()
+{
+ cat $B0/${1}.pid
+}
+
function ta_up_status()
{
local v=$1
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 702973a6ef4..641485b1ed0 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1310,6 +1310,14 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)
goto refresh_done;
}
+ if (priv->thin_arbiter_count && local->is_read_txn &&
+ AFR_COUNT (success_replies, priv->child_count) !=
+ priv->child_count) {
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ error = EINVAL;
+ goto refresh_done;
+ }
+
ret = afr_replies_interpret (frame, this, local->refreshinode,
&start_heal);
@@ -6977,3 +6985,17 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc)
out:
return ret;
}
+
+call_frame_t *
+afr_ta_frame_create (xlator_t *this)
+{
+ call_frame_t *frame = NULL;
+ void *lk_owner = NULL;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ return NULL;
+ lk_owner = (void *)this;
+ afr_set_lk_owner (frame, this, lk_owner);
+ return frame;
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index a66a6660b0d..945c050da03 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index)
GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
+static gf_boolean_t
+afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv,
+ int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ /* Not doing a ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
+ }
+ }
+
+ return _gf_false;
+}
+
void
afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
return 0;
}
+static int
+afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque)
+{
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
+
+static int
+afr_ta_read_txn (void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ int read_subvol = -1;
+ int up_child = AFR_CHILD_UNKNOWN;
+ int possible_bad_child = AFR_CHILD_UNKNOWN;
+ int ret = 0;
+ int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {0, };
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int **pending = NULL;
+ loc_t loc = {0,};
+
+ frame = (call_frame_t *)opaque;
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+
+ if (local->child_up[AFR_CHILD_ZERO]) {
+ up_child = AFR_CHILD_ZERO;
+ possible_bad_child = AFR_CHILD_ONE;
+ } else if (local->child_up[AFR_CHILD_ONE]) {
+ up_child = AFR_CHILD_ONE;
+ possible_bad_child = AFR_CHILD_ZERO;
+ }
+
+ GF_ASSERT (up_child != AFR_CHILD_UNKNOWN);
+
+ /* Query the up_child to see if it blames the down one. */
+ xdata_req = dict_new();
+ if (!xdata_req)
+ goto out;
+
+ pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!pending)
+ goto out;
+
+ ret = afr_set_pending_dict (priv, xdata_req, pending);
+ if (ret < 0)
+ goto out;
+
+ if (local->fd) {
+ ret = syncop_fxattrop (priv->children[up_child], local->fd,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
+ &xdata_rsp, NULL);
+ } else {
+ ret = syncop_xattrop (priv->children[up_child], &local->loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
+ &xdata_rsp, NULL);
+ }
+ if (ret || !xdata_rsp) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed xattrop for gfid %s on %s",
+ uuid_utoa (local->inode->gfid),
+ priv->children[up_child]->name);
+ op_errno = -ret;
+ goto out;
+ }
+
+ if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv,
+ possible_bad_child)) {
+ read_subvol = up_child;
+ goto out;
+ }
+ dict_unref (xdata_rsp);
+ /* Query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc (this, &loc);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.",
+ loc.name);
+ goto out;
+ }
+ flock.l_type = F_WRLCK; /* start and length are already zero. */
+ ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock,
+ NULL, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+ uuid_utoa (local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto out;
+ }
+
+ ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ if (ret || !xdata_rsp) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed xattrop on %s.",
+ uuid_utoa (local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto unlock;
+ }
+
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) {
+ read_subvol = up_child;
+ } else {
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Failing read for gfid %s since good brick %s is down",
+ uuid_utoa (local->inode->gfid),
+ priv->children[possible_bad_child]->name);
+ op_errno = EIO;
+ }
+
+unlock:
+ flock.l_type = F_UNLCK;
+ ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock,
+ NULL, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+ "%s.", uuid_utoa (local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ }
+out:
+ if (xdata_req)
+ dict_unref(xdata_req);
+ if (xdata_rsp)
+ dict_unref(xdata_rsp);
+ if (pending)
+ afr_matrix_cleanup (pending, priv->child_count);
+ loc_wipe (&loc);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+ afr_read_txn_wind (frame, this, read_subvol);
+ return ret;
+}
+
+void
+afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto out;
+ }
+ ret = synctask_new (this->ctx->env, afr_ta_read_txn,
+ afr_ta_read_txn_done, ta_frame, frame);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
+ AFR_MSG_THIN_ARB, "Failed to launch "
+ "afr_ta_read_txn synctask for gfid %s.",
+ uuid_utoa(local->inode->gfid));
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ STACK_DESTROY(ta_frame->root);
+ goto out;
+ }
+ return;
+out:
+ afr_read_txn_wind (frame, this, -1);
+}
+
int
afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t *local = NULL;
- int read_subvol = 0;
- inode_t *inode = NULL;
- int ret = -1;
- int spb_choice = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ inode_t *inode = NULL;
+ int ret = -1;
+ int spb_choice = -1;
local = frame->local;
inode = local->inode;
+ priv = this->private;
if (err) {
- read_subvol = -1;
- goto readfn;
+ if (!priv->thin_arbiter_count)
+ goto readfn;
+ if (err != EINVAL)
+ goto readfn;
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ afr_ta_read_txn_synctask (frame, this);
+ return 0;
}
read_subvol = afr_read_subvol_select_by_policy (inode, this,
@@ -127,7 +335,6 @@ readfn:
return 0;
}
-
int
afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
}
}
-
/*
afr_read_txn:
@@ -207,13 +413,13 @@ int
afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
afr_read_txn_wind_t readfn, afr_transaction_type type)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- unsigned char *data = NULL;
- unsigned char *metadata = NULL;
- int read_subvol = -1;
- int event_generation = 0;
- int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data = NULL;
+ unsigned char *metadata = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
priv = this->private;
local = frame->local;
@@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->readfn = readfn;
local->inode = inode_ref (inode);
local->is_read_txn = _gf_true;
+ local->transaction.type = type;
if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
local->op_ret = -1;
local->op_errno = afr_quorum_errno(priv);
- read_subvol = -1;
goto read;
}
if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {
local->op_ret = -1;
- read_subvol = -1;
goto read;
}
- local->transaction.type = type;
+ if (priv->thin_arbiter_count &&
+ AFR_COUNT (local->child_up, priv->child_count) !=
+ priv->child_count) {
+ afr_ta_read_txn_synctask (frame, this);
+ return 0;
+ }
+
ret = afr_inode_read_subvol_get (inode, this, data, metadata,
&event_generation);
if (ret == -1)
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 155dc1d96af..f7b636cb3d9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1252,4 +1252,7 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc);
gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);
+
+call_frame_t*
+afr_ta_frame_create (xlator_t *this);
#endif /* __AFR_H__ */