diff options
| -rw-r--r-- | tests/basic/afr/ta-read.t | 60 | ||||
| -rw-r--r-- | tests/thin-arbiter.rc | 5 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 22 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 249 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 3 | 
5 files changed, 320 insertions, 19 deletions
| diff --git a/tests/basic/afr/ta-read.t b/tests/basic/afr/ta-read.t new file mode 100644 index 00000000000..f2b3c38e06c --- /dev/null +++ b/tests/basic/afr/ta-read.t @@ -0,0 +1,60 @@ +#!/bin/bash + +# Test read transaction logic for thin-arbiter. + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../thin-arbiter.rc +cleanup; +TEST ta_create_brick_and_volfile brick0 +TEST ta_create_brick_and_volfile brick1 +TEST ta_create_ta_and_volfile ta +TEST ta_start_brick_process brick0 +TEST ta_start_brick_process brick1 +TEST ta_start_ta_process ta + +TEST ta_create_mount_volfile brick0 brick1 ta +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta + +TEST touch $M0/FILE +TEST ls $B0/brick0/FILE +TEST ls $B0/brick1/FILE +TEST ! ls $B0/ta/FILE + +# Kill one brick and write to FILE. +TEST ta_kill_brick brick0 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 +echo "brick0 down">> $M0/FILE +TEST [ $? -eq 0 ] +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/FILE +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.patchy-ta-2 + +#Umount and mount to remove cached data. +TEST umount $M0 +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 +# Read must be allowed since good brick is up. +TEST  cat $M0/FILE + +# Toggle good and bad data brick processes. +TEST ta_start_brick_process brick0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 +TEST ta_kill_brick brick1 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 +# Read must now fail. +TEST ! cat $M0/FILE + +# Bring all data bricks up, and kill TA. +TEST ta_start_brick_process brick1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 +TA_PID=$(ta_get_pid_by_brick_name ta) +TEST [ -n $TA_PID ] +TEST ta_kill_brick ta +TA_PID=$(ta_get_pid_by_brick_name ta) +TEST [ -z $TA_PID ] +# Read must now succeed. +TEST cat $M0/FILE +cleanup; diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc index 8bb888a7471..36d11cea61d 100644 --- a/tests/thin-arbiter.rc +++ b/tests/thin-arbiter.rc @@ -419,6 +419,11 @@ function ta_kill_brick()          kill -9 $p  } +function ta_get_pid_by_brick_name() +{ +        cat $B0/${1}.pid +} +  function ta_up_status()  {          local v=$1 diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 702973a6ef4..641485b1ed0 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1310,6 +1310,14 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)                  goto refresh_done;          } +        if (priv->thin_arbiter_count && local->is_read_txn && +            AFR_COUNT (success_replies, priv->child_count) != +                       priv->child_count) { +                /* We need to query the good bricks and/or thin-arbiter.*/ +                error = EINVAL; +                goto refresh_done; +        } +  	ret = afr_replies_interpret (frame, this, local->refreshinode,                                       &start_heal); @@ -6977,3 +6985,17 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc)  out:          return ret;  } + +call_frame_t * +afr_ta_frame_create (xlator_t *this) +{ +        call_frame_t *frame    = NULL; +        void         *lk_owner = NULL; + +        frame = create_frame (this, this->ctx->pool); +        if (!frame) +                return NULL; +        lk_owner = (void *)this; +        afr_set_lk_owner (frame, this, lk_owner); +        return frame; +} diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index a66a6660b0d..945c050da03 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index)          GF_ATOMIC_DEC(priv->pending_reads[child_index]);  } +static gf_boolean_t +afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv, +                                    int child) +{ +        int     *pending                     = NULL; +        int     ret                          = 0; +        int     i                            = 0; + +        ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending); +        if (ret == 0) { +                for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +                        /* Not doing a ntoh32(pending) as we just want to check +                         * if it is non-zero or not. */ +                        if (pending[i]) { +                                return _gf_true; +                        } +                } +        } + +        return _gf_false; +} +  void  afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)  { @@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)  	return 0;  } +static int +afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque) +{ +        STACK_DESTROY(ta_frame->root); +        return 0; +} + +static int +afr_ta_read_txn (void *opaque) +{ +        call_frame_t    *frame             = NULL; +        xlator_t        *this              = NULL; +        int             read_subvol        = -1; +        int             up_child           = AFR_CHILD_UNKNOWN; +        int             possible_bad_child = AFR_CHILD_UNKNOWN; +        int             ret                = 0; +        int             op_errno           = ENOMEM; +        afr_local_t     *local             = NULL; +        afr_private_t   *priv              = NULL; +        struct gf_flock flock              = {0, }; +        dict_t          *xdata_req         = NULL; +        dict_t          *xdata_rsp         = NULL; +        int             **pending          = NULL; +        loc_t           loc                = {0,}; + +        frame = (call_frame_t *)opaque; +        this = frame->this; +        local = frame->local; +        priv = this->private; + +        if (local->child_up[AFR_CHILD_ZERO]) { +                up_child = AFR_CHILD_ZERO; +                possible_bad_child = AFR_CHILD_ONE; +        } else if (local->child_up[AFR_CHILD_ONE]) { +                up_child = AFR_CHILD_ONE; +                possible_bad_child = AFR_CHILD_ZERO; +        } + +        GF_ASSERT (up_child != AFR_CHILD_UNKNOWN); + +        /* Query the up_child to see if it blames the down one. */ +        xdata_req = dict_new(); +        if (!xdata_req) +                goto out; + +        pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); +        if (!pending) +                goto out; + +        ret = afr_set_pending_dict (priv, xdata_req, pending); +        if (ret < 0) +                goto out; + +        if (local->fd) { +                ret = syncop_fxattrop (priv->children[up_child], local->fd, +                                       GF_XATTROP_ADD_ARRAY, xdata_req, NULL, +                                       &xdata_rsp, NULL); +        } else { +               ret = syncop_xattrop (priv->children[up_child], &local->loc, +                                     GF_XATTROP_ADD_ARRAY, xdata_req, NULL, +                                     &xdata_rsp, NULL); +        } +        if (ret || !xdata_rsp) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "Failed xattrop for gfid %s on %s", +                        uuid_utoa (local->inode->gfid), +                        priv->children[up_child]->name); +                op_errno = -ret; +                goto out; +        } + +        if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv, +                                                possible_bad_child)) { +                read_subvol = up_child; +                goto out; +        } +        dict_unref (xdata_rsp); +        /* Query thin-arbiter to see if it blames any data brick. */ +        ret = afr_fill_ta_loc (this, &loc); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "Failed to populate thin-arbiter loc for: %s.", +                        loc.name); +                goto out; +        } +        flock.l_type = F_WRLCK;/*start and length are already zero. */ +        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], +                              AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, +                              NULL, NULL); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.", +                        uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +                op_errno = -ret; +                goto out; +        } + +        ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc, +                              GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, +                              NULL); +        if (ret || !xdata_rsp) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed xattrop on %s.", +                        uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +                op_errno = -ret; +                goto unlock; +        } + +        if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) { +                read_subvol = up_child; +        } else { +                gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, +                        "Failing read for gfid %s since good brick %s is down", +                        uuid_utoa (local->inode->gfid), +                        priv->children[possible_bad_child]->name); +                op_errno = EIO; +        } + +unlock: +        flock.l_type = F_UNLCK; +        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], +                              AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, +                              NULL, NULL); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, +                        "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on " +                        "%s.", uuid_utoa (local->inode->gfid), +                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]); +        } +out: +        if (xdata_req) +                dict_unref(xdata_req); +        if (xdata_rsp) +                dict_unref(xdata_rsp); +        if (pending) +                afr_matrix_cleanup (pending, priv->child_count); +        loc_wipe (&loc); + +        if (read_subvol == -1) { +                local->op_ret = -1; +                local->op_errno = op_errno; +        } +        afr_read_txn_wind (frame, this, read_subvol); +        return ret; +} + +void +afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *ta_frame  = NULL; +        afr_local_t  *local     = NULL; +        int          ret        = 0; + +        local = frame->local; +        ta_frame = afr_ta_frame_create(this); +        if (!ta_frame) { +                local->op_ret = -1; +                local->op_errno = ENOMEM; +                goto out; +        } +        ret = synctask_new (this->ctx->env, afr_ta_read_txn, +                            afr_ta_read_txn_done, ta_frame, frame); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, ENOMEM, +                        AFR_MSG_THIN_ARB, "Failed to launch " +                        "afr_ta_read_txn synctask for gfid %s.", +                        uuid_utoa(local->inode->gfid)); +                local->op_ret = -1; +                local->op_errno = ENOMEM; +                STACK_DESTROY(ta_frame->root); +                goto out; +        } +        return; +out: +        afr_read_txn_wind (frame, this, -1); +} +  int  afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)  { -	afr_local_t *local = NULL; -	int read_subvol = 0; -	inode_t *inode = NULL; -	int ret = -1; -        int spb_choice = -1; +        afr_private_t   *priv           = NULL; +        afr_local_t     *local          = NULL; +        int             read_subvol     = -1; +        inode_t         *inode          = NULL; +        int             ret             = -1; +        int             spb_choice      = -1;  	local = frame->local;  	inode = local->inode; +        priv = this->private;          if (err) { -                read_subvol = -1; -                goto readfn; +                if (!priv->thin_arbiter_count) +                        goto readfn; +                if (err != EINVAL) +                        goto readfn; +                /* We need to query the good bricks and/or thin-arbiter.*/ +                afr_ta_read_txn_synctask (frame, this); +                return 0;          }  	read_subvol = afr_read_subvol_select_by_policy (inode, this, @@ -127,7 +335,6 @@ readfn:  	return 0;  } -  int  afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)  { @@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)  	}  } -  /*    afr_read_txn: @@ -207,13 +413,13 @@ int  afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	      afr_read_txn_wind_t readfn, afr_transaction_type type)  { -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; -        unsigned char *data = NULL; -        unsigned char *metadata = NULL; -	int read_subvol = -1; -	int event_generation = 0; -	int ret = -1; +        afr_local_t     *local           = NULL; +        afr_private_t   *priv            = NULL; +        unsigned char   *data            = NULL; +        unsigned char   *metadata        = NULL; +        int             read_subvol      = -1; +        int             event_generation = 0; +        int             ret              = -1;  	priv = this->private;  	local = frame->local; @@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	local->readfn = readfn;  	local->inode = inode_ref (inode);          local->is_read_txn = _gf_true; +        local->transaction.type = type;          if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {                  local->op_ret = -1;                  local->op_errno = afr_quorum_errno(priv); -                read_subvol = -1;                  goto read;          }          if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {                  local->op_ret = -1; -                read_subvol = -1;                  goto read;          } -	local->transaction.type = type; +        if (priv->thin_arbiter_count && +            AFR_COUNT (local->child_up, priv->child_count) != +                       priv->child_count) { +                afr_ta_read_txn_synctask (frame, this); +                return 0; +        } +          ret = afr_inode_read_subvol_get (inode, this, data, metadata,                                           &event_generation);  	if (ret == -1) diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 155dc1d96af..f7b636cb3d9 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1252,4 +1252,7 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc);  gf_boolean_t  afr_is_pending_set (xlator_t *this, dict_t *xdata, int type); + +call_frame_t* +afr_ta_frame_create (xlator_t *this);  #endif /* __AFR_H__ */ | 
