diff options
| -rw-r--r-- | tests/basic/afr/arbiter.t | 64 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 25 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 12 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 183 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 6 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 16 | 
10 files changed, 272 insertions, 47 deletions
diff --git a/tests/basic/afr/arbiter.t b/tests/basic/afr/arbiter.t new file mode 100644 index 00000000000..a9d485cd7b4 --- /dev/null +++ b/tests/basic/afr/arbiter.t @@ -0,0 +1,64 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../afr.rc +cleanup; + +TEST glusterd; +TEST pidof glusterd + +# Non arbiter replica 3 volumes should not have arbiter-count option enabled. +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; +TEST ! stat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count +TEST umount $M0 +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +# Create and mount a replica 3 arbiter volume. +TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; +TEST stat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count +EXPECT "1" cat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count + +# Write data and metadata +TEST `echo hello >> $M0/file` +TEST setfattr -n user.name -v value1  $M0/file + +# Data I/O will fail if arbiter is the only source. +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST `echo "B0 is down, B1 and B2 are sources" >> $M0/file` +TEST setfattr -n user.name -v value2  $M0/file +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST `echo "B2 is down, B3 is the only source, writes will fail" >> $M0/file` +TEST ! cat $M0/file +# Metadata I/O should still succeed. +TEST getfattr -n user.name $M0/file +TEST setfattr -n user.name -v value3 $M0/file + +#shd should not data self-heal from arbiter to the sinks. +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"1") +EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"2") + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +TEST $CLI volume heal $V0 +EXPECT 0 afr_get_pending_heal_count $V0 + +# I/O can resume again. +TEST cat $M0/file +TEST getfattr -n user.name $M0/file +TEST `echo append>> $M0/file` +TEST umount $M0 +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 5654e3ad03d..c6501cda97a 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -993,6 +993,17 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          afr_entry_lockee_cleanup (&local->internal_lock);          GF_FREE (local->transaction.pre_op); + +        GF_FREE (local->transaction.pre_op_sources); +        if (local->transaction.pre_op_xdata) { +                for (i = 0; i < priv->child_count; i++) { +                        if (!local->transaction.pre_op_xdata[i]) +                                continue; +                        dict_unref (local->transaction.pre_op_xdata[i]); +                } +                GF_FREE (local->transaction.pre_op_xdata); +        } +          GF_FREE (local->transaction.eager_lock);          GF_FREE (local->transaction.fop_subvols);          GF_FREE (local->transaction.failed_subvols); @@ -4084,6 +4095,20 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          if (!local->transaction.pre_op)                  goto out; +        if (priv->arbiter_count == 1) { +                local->transaction.pre_op_xdata = +                        GF_CALLOC (sizeof (*local->transaction.pre_op_xdata), +                                   priv->child_count, gf_afr_mt_dict_t); +                if (!local->transaction.pre_op_xdata) +                        goto out; + +                local->transaction.pre_op_sources = +                        GF_CALLOC (sizeof (*local->transaction.pre_op_sources), +                                   priv->child_count, gf_afr_mt_char); +                if (!local->transaction.pre_op_sources) +                        goto out; +        } +          local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols),  						    priv->child_count,  						    gf_afr_mt_char); diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index f7ca9108092..8a2c0e46e40 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -168,6 +168,8 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)  				local->replies[i].postparent2;  		}  	} + +        afr_txn_arbitrate_fop_cbk (frame, this);  } diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index f712166e44d..f9fde44e9e4 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,7 +37,7 @@  #include "protocol-common.h"  #include "afr-transaction.h" - +#include "afr-self-heal.h"  static void  __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) @@ -97,6 +97,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)  			}  		}  	} + +        afr_txn_arbitrate_fop_cbk (frame, this);  } @@ -342,6 +344,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)  {          call_frame_t    *transaction_frame = NULL;          afr_local_t     *local             = NULL; +        afr_private_t   *priv              = NULL;          int             ret   = -1;          int             op_errno = ENOMEM; @@ -350,6 +353,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)                  goto out;          local = frame->local; +        priv = this->private;          transaction_frame->local = local;  	frame->local = NULL; @@ -379,6 +383,12 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)                  local->transaction.start   = local->cont.writev.offset;                  local->transaction.len     = iov_length (local->cont.writev.vector,                                                           local->cont.writev.count); + +                /*Lock entire file to avoid network split brains.*/ +                if (priv->arbiter_count == 1) { +                        local->transaction.start   = 0; +                        local->transaction.len     = 0; +                }          }          ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a8a7326e4ec..7567fe9f851 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -684,6 +684,12 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,  		source = ret; +                if (priv->arbiter_count == 1 && source == ARBITER_BRICK_INDEX && +                    AFR_COUNT (sources, priv->child_count) == 1) { +                        did_sh = _gf_false; +                        goto unlock; +                } +  		ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,  						     locked_replies,  						     locked_replies[source].poststat.ia_size); diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 32be2480234..956f075e25b 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -141,6 +141,9 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,                               afr_transaction_type type,                               unsigned char *locked_on, unsigned char *sources,                               unsigned char *sinks, uint64_t *witness); +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, int idx, +                          dict_t *xdata);  int  afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d2111060035..a2023884465 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -15,7 +15,7 @@  #include "afr.h"  #include "afr-transaction.h" - +#include "afr-self-heal.h"  #include <signal.h>  gf_boolean_t @@ -139,14 +139,130 @@ __mark_all_success (call_frame_t *frame, xlator_t *this)  	}  } +void +afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        afr_transaction_type type = -1; +        dict_t *xdata = NULL; +        int **matrix = NULL; +        int idx = -1; +        int i = 0; +        int j = 0; + +        priv = this->private; +        local = frame->local; +        type = local->transaction.type; +        idx = afr_index_for_transaction_type (type); +        matrix = ALLOC_MATRIX (priv->child_count, int); + +        for (i = 0; i < priv->child_count; i++) { +                if (!local->transaction.pre_op_xdata[i]) +                        continue; +                xdata = local->transaction.pre_op_xdata[i]; +                afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); +        } + +        memset (local->transaction.pre_op_sources, 1, priv->child_count); + +        /*If lock or pre-op failed on a brick, it is not a source. */ +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.failed_subvols[i]) +                        local->transaction.pre_op_sources[i] = 0; +        } + +        /* If brick is blamed by others, it is not a source. */ +        for (i = 0; i < priv->child_count; i++) +                for (j = 0; j < priv->child_count; j++) +                        if (matrix[i][j] != 0) +                                local->transaction.pre_op_sources[j] = 0; + +        /*We don't need the xattrs any more. */ +        for (i = 0; i < priv->child_count; i++) +                if (local->transaction.pre_op_xdata[i]) { +                        dict_unref (local->transaction.pre_op_xdata[i]); +                        local->transaction.pre_op_xdata[i] = NULL; +                } +} + +void +afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        gf_boolean_t fop_failed = _gf_false; +        unsigned char *pre_op_sources = NULL; +        int i = 0; + +        local = frame->local; +        priv  = this->private; +        pre_op_sources = local->transaction.pre_op_sources; + +        if (priv->arbiter_count != 1 || local->op_ret < 0) +                return; + +        /* If the fop failed on the brick, it is not a source. */ +        for (i = 0; i < priv->child_count; i++) +                if (local->transaction.failed_subvols[i]) +                        pre_op_sources[i] = 0; + +        switch (AFR_COUNT (pre_op_sources, priv->child_count)) { +        case 1: +                if (pre_op_sources[ARBITER_BRICK_INDEX]) +                        fop_failed = _gf_true; +                break; +        case 0: +                fop_failed = _gf_true; +                break; +        } + +        if (fop_failed) { +                local->op_ret = -1; +                local->op_errno = ENOTCONN; +        } + +        return; +} + +void +afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int pre_op_sources_count = 0; + +        priv = this->private; +        local = frame->local; + +        afr_compute_pre_op_sources (frame, this); +        pre_op_sources_count = AFR_COUNT (local->transaction.pre_op_sources, +                                          priv->child_count); + +        /* If arbiter is the only source, do not proceed. */ +        if (pre_op_sources_count < 2 && +            local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { +                local->internal_lock.lock_cbk = local->transaction.done; +                local->op_ret = -1; +                local->op_errno =  ENOTCONN; +                afr_restore_lk_owner (frame); +                afr_unlock (frame, this); +        } else { +                local->transaction.fop (frame, this); +        } + +        return; +}  int  afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)  {          afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL;          fd_t            *fd   = NULL;          local = frame->local; +        priv = this->private;          fd    = local->fd;          /*  Perform fops with the lk-owner from top xlator. @@ -172,12 +288,15 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)          */          if (fd)                  afr_delayed_changelog_wake_up (this, fd); -        local->transaction.fop (frame, this); +        if (priv->arbiter_count == 1) { +                afr_txn_arbitrate_fop (frame, this); +        } else { +                local->transaction.fop (frame, this); +        }  	return 0;  } -  static int  __changelog_enabled (afr_private_t *priv, afr_transaction_type type)  { @@ -372,11 +491,16 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)  {          afr_private_t *priv = NULL;          afr_local_t *local = NULL; +        int pre_op_count = 0;          int i = 0;          local = frame->local;  	priv = this->private; +        pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); +        if (pre_op_count < priv->child_count) +                return _gf_false; +          for (i = 0; i < priv->child_count; i++) {                  if (local->transaction.failed_subvols[i])                          return _gf_false; @@ -591,9 +715,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)  	else  		need_undirty = _gf_true; -        //If the fop fails on all the subvols then pending markers are placed -        //for every subvol on all subvolumes. Which is nothing but split-brain. -        //Avoid this by not doing post-op in case of failures.          if (local->op_ret < 0) {                  afr_changelog_post_op_done (frame, this);                  goto out; @@ -846,12 +967,22 @@ afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)  {          afr_local_t *local = NULL; +        afr_private_t *priv = NULL;          int call_count = -1; +        int child_index = -1;          local = frame->local; +        priv = this->private; +        child_index = (long) cookie;  	if (op_ret == -1) -		afr_transaction_fop_failed (frame, this, (long) cookie); +		afr_transaction_fop_failed (frame, this, child_index); + +        if (priv->arbiter_count == 1 && !op_ret) { +                if (xattr) +                        local->transaction.pre_op_xdata[child_index] = +                                                               dict_ref (xattr); +        }  	call_count = afr_frame_return (frame); @@ -964,7 +1095,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          afr_local_t *local = NULL;          afr_internal_lock_t *int_lock = NULL;          unsigned char       *locked_nodes = NULL; -	unsigned char       *pending_subvols = NULL;  	int idx = -1;  	gf_boolean_t pre_nop = _gf_true;  	dict_t *xdata_req = NULL; @@ -975,15 +1105,13 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); -	pending_subvols = alloca0 (priv->child_count); -  	for (i = 0; i < priv->child_count; i++) {  		if (locked_nodes[i]) {  			local->transaction.pre_op[i] = 1;  			call_count++;  		} else { -			pending_subvols[i] = 1; -		} +                        local->transaction.failed_subvols[i] = 1; +                }  	}          /* This condition should not be met with present code, as @@ -1009,28 +1137,21 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  		goto err;  	} -	pre_nop = _gf_true; -  	if (afr_changelog_pre_op_inherit (frame, this))  		goto next; -	if (call_count < priv->child_count) { -		/* For subvols we are not performing operation on, -		   mark them as pending up-front along with the FOP -		   so that we can safely defer unmarking dirty until -		   later. -		*/ -		for (i = 0; i < priv->child_count; i++) { -			if (pending_subvols[i]) -				local->pending[i][idx] = hton32(1); -		} -		ret = afr_set_pending_dict (priv, xdata_req, -					    local->pending); -		if (ret < 0) { -			op_errno = ENOMEM; -			goto err; -		} -		pre_nop = _gf_false; +        if (call_count < priv->child_count) +                pre_nop = _gf_false; + +        /* Set an all-zero pending changelog so that in the cbk, we can get the +         * current on-disk values. In a replica 3 volume with arbiter enabled, +         * these values are needed to arrive at a go/ no-go of the fop phase to +         * avoid ending up in split-brain.*/ + +        ret = afr_set_pending_dict (priv, xdata_req, local->pending); +	if (ret < 0) { +		op_errno = ENOMEM; +		goto err;  	}  	if (call_count > 1 && diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index c3ce333b771..47d43d88991 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -16,6 +16,8 @@  void  afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,  			    int child_index); +void +afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this);  int  afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f7bc6ea0f94..6cb708ffbd7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,6 +39,8 @@  #define AFR_DOM_COUNT_MAX    3  #define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/ +#define ARBITER_BRICK_INDEX 2 +  typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);  typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); @@ -656,6 +658,10 @@ typedef struct _afr_local {                  unsigned char   *pre_op; +                /* For arbiter configuration only. */ +                dict_t **pre_op_xdata; +                unsigned char *pre_op_sources; +  		/* @fop_subvols: subvolumes on which FOP will be attempted */                  unsigned char   *fop_subvols; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 109e7c214d4..eaede5070b4 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -3985,19 +3985,6 @@ out:  }  static int -volgen_graph_build_replicate_clusters (volgen_graph_t *graph, -                                       glusterd_volinfo_t *volinfo) -{ -        char               *replicate_args[]   = {"cluster/replicate", -                                                  "%s-replicate-%d"}; - -        return volgen_link_bricks_from_list_tail (graph, volinfo, "cluster/replicate", -                                            "%s-replicate-%d", -                                            volinfo->brick_count, -                                            volinfo->replica_count); -} - -static int  build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                      dict_t *set_dict)  { @@ -4011,8 +3998,7 @@ build_shd_clusters (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,          switch (volinfo->type) {          case GF_CLUSTER_TYPE_REPLICATE:          case GF_CLUSTER_TYPE_STRIPE_REPLICATE: -                clusters = volgen_graph_build_replicate_clusters (graph, -                                                                  volinfo); +                clusters = volgen_graph_build_afr_clusters (graph, volinfo);                  break;          case GF_CLUSTER_TYPE_DISPERSE:  | 
