diff options
| -rw-r--r-- | libglusterfs/src/common-utils.c | 24 | ||||
| -rw-r--r-- | libglusterfs/src/common-utils.h | 11 | ||||
| -rw-r--r-- | tests/bugs/replicate/bug-1363721.t | 112 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 125 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 19 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 7 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 17 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 16 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 21 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 2 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-locks.c | 2 | 
12 files changed, 308 insertions, 51 deletions
| diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 9a5f90b02f1..b62e69cf102 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -4494,3 +4494,27 @@ gf_zero_fill_stat (struct iatt *buf)          buf->ia_nlink = 0;          buf->ia_ctime = 0;  } + +int +gf_bits_count (uint64_t n) +{ +        int val = 0; +#ifdef _GNU_SOURCE +        val = __builtin_popcountll (n); +#else +        n -= (n >> 1) & 0x5555555555555555ULL; +        n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL); +        n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL; +        n += n >> 8; +        n += n >> 16; +        n += n >> 32; +        val = n & 0xFF; +#endif +        return val; +} + +int +gf_bits_index (uint64_t n) +{ +    return ffsll(n) - 1; +} diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index f1c26a2d0c5..93dee58b079 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -25,6 +25,10 @@  #include <limits.h>  #include <fnmatch.h> +#ifndef ffsll +#define ffsll(x) __builtin_ffsll(x) +#endif +  void trap (void);  #define GF_UNIVERSAL_ANSWER 42    /* :O */ @@ -835,4 +839,11 @@ is_virtual_xattr (const char *k);  const char *  gf_inode_type_to_str (ia_type_t type); + +int32_t +gf_bits_count (uint64_t n); + +int32_t +gf_bits_index (uint64_t n); +  #endif /* _COMMON_UTILS_H */ diff --git a/tests/bugs/replicate/bug-1363721.t b/tests/bugs/replicate/bug-1363721.t new file mode 100644 index 00000000000..ec39889b27e --- /dev/null +++ b/tests/bugs/replicate/bug-1363721.t @@ -0,0 +1,112 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +FILE_UPDATE_TIMEOUT=20 +cleanup + +function size_increased { +        local file=$1 +        local size=$2 +        local new_size=$(stat -c%s $file) +        if [ $new_size -gt $size ]; +        then +                echo "Y" +        else +                echo "N" +        fi +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 --direct-io-mode=enable + +cd $M0 + +# Start writing to a file. +(dd if=/dev/urandom of=$M0/file1 bs=1k 2>/dev/null 1>/dev/null)& +dd_pid=$! + +# Let IO happen +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 0 + +# Now kill the zeroth brick +kill_brick $V0 $H0 $B0/${V0}0 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now kill the first brick +kill_brick $V0 $H0 $B0/${V0}1 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +# Let IO continue for 3 seconds +sleep 3 + +# Now kill the second brick +kill_brick $V0 $H0 $B0/${V0}2 + +# At this point the write should have been failed. But make sure that the second +# brick is never an accused. + +md5sum_2=$(md5sum $B0/${V0}2/file1 | awk '{print $1}') + +EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'` +EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'` + +EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.dirty data +EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.dirty data + +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 metadata +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 metadata +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 metadata + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +# Enable shd +TEST $CLI volume set $V0 self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 + +TEST $CLI volume heal $V0 + +# Wait for heal to complete +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'` +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'` +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}2/file1 | awk '{print $1}'` + +cd ~ + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 557b2cd8891..9b2c0d7caea 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -155,6 +155,119 @@ out:   */  int +__afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, +                               inode_t *inode) +{ +        int                 i               = 0; +        int                 ret             = -1; +        int                 txn_type        = 0; +        int                 count           = 0; +        int                 index           = -1; +        uint16_t            datamap_old     = 0; +        uint16_t            metadatamap_old = 0; +        uint16_t            datamap         = 0; +        uint16_t            metadatamap     = 0; +        uint16_t            tmp_map         = 0; +        uint16_t            mask            = 0; +        uint32_t            event           = 0; +        uint64_t            val             = 0; +        afr_private_t      *priv            = NULL; +        afr_inode_ctx_t    *ctx             = NULL; + +        priv = this->private; +        txn_type = local->transaction.type; + +        ret = __afr_inode_ctx_get (this, inode, &ctx); +        if (ret < 0) +                return ret; + +        val = ctx->read_subvol; + +        metadatamap_old = metadatamap = (val & 0x000000000000ffff); +        datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; +        /* Hard-code event to 0 since there is a failure and the inode +         * needs to be refreshed anyway. +         */ +        event = 0; + +        if (txn_type == AFR_DATA_TRANSACTION) +                tmp_map = datamap; +        else if (txn_type == AFR_METADATA_TRANSACTION) +                tmp_map = metadatamap; + +        count = gf_bits_count (tmp_map); + +        if (count == 1) +                index = gf_bits_index (tmp_map); + +        for (i = 0; i < priv->child_count; i++) { +                mask = 0; +                if (!local->transaction.failed_subvols[i]) +                        continue; + +                mask = 1 << i; +                if (txn_type == AFR_METADATA_TRANSACTION) +                        metadatamap &= ~mask; +                else if (txn_type == AFR_DATA_TRANSACTION) +                        datamap &= ~mask; +        } + +        switch (txn_type) { +        case AFR_METADATA_TRANSACTION: +                if ((metadatamap_old != 0) && (metadatamap == 0) && +                    (count == 1)) { +                        local->transaction.in_flight_sb_errno = +                                                local->replies[index].op_errno; +                        local->transaction.in_flight_sb = _gf_true; +                        metadatamap |= (1 << index); +                } +                break; + +        case AFR_DATA_TRANSACTION: +                if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { +                        local->transaction.in_flight_sb_errno = +                                                local->replies[index].op_errno; +                        local->transaction.in_flight_sb = _gf_true; +                        datamap |= (1 << index); +                } +                break; + +        default: +        break; +        } + +        val = ((uint64_t) metadatamap) | +                (((uint64_t) datamap) << 16) | +                (((uint64_t) event) << 32); + +        ctx->read_subvol = val; + +        return ret; +} + +int +afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, inode_t *inode) +{ +        int            ret  = -1; +        afr_private_t *priv = NULL; + +        priv = this->private; + +        /* If this transaction saw no failures, then exit. */ +        if (AFR_COUNT (local->transaction.failed_subvols, +                       priv->child_count) == 0) +                return 0; + +        LOCK (&inode->lock); +        { +                ret = __afr_set_in_flight_sb_status (this, local, inode); +        } +        UNLOCK (&inode->lock); + +        return ret; +} + +int  __afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,  				   unsigned char *data, unsigned char *metadata,  				   int *event_p) @@ -233,12 +346,12 @@ out:  int  __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)  { -	int ret = -1; -	uint16_t datamap = 0; -	uint16_t metadatamap = 0; -	uint32_t event = 0; -	uint64_t val = 0; -        afr_inode_ctx_t *ctx = NULL; +	int               ret         = -1; +	uint16_t          datamap     = 0; +	uint16_t          metadatamap = 0; +	uint32_t          event       = 0; +	uint64_t          val         = 0; +        afr_inode_ctx_t  *ctx         = NULL;  	ret = __afr_inode_ctx_get (this, inode, &ctx);          if (ret) diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index f3de5352d7e..286a5392da6 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -122,8 +122,7 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)  			continue;  		if (local->replies[i].op_ret < 0) {  			if (local->inode) -				afr_inode_read_subvol_reset (local->inode, -							     this); +				afr_inode_read_subvol_reset (local->inode, this);  			if (local->parent)  				afr_inode_read_subvol_reset (local->parent,  							     this); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 76526bcf177..fa29a5e9291 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -38,13 +38,13 @@  static void  __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)  { -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; -	int read_subvol = 0; -	int i = 0; -        afr_read_subvol_args_t args = {0,}; -        struct iatt  *stbuf = NULL; -        int    ret = 0; +	int                       i               = 0; +        int                       ret             = 0; +	int                       read_subvol     = 0; +        struct iatt              *stbuf           = NULL; +	afr_local_t              *local           = NULL; +	afr_private_t            *priv            = NULL; +        afr_read_subvol_args_t    args            = {0,};  	local = frame->local;  	priv = this->private; @@ -94,10 +94,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)  	for (i = 0; i < priv->child_count; i++) {  		if (!local->replies[i].valid)  			continue; -		if (local->replies[i].op_ret < 0) { -			afr_inode_read_subvol_reset (local->inode, this); +		if (local->replies[i].op_ret < 0)  			continue; -		}  		/* Order of checks in the compound conditional  		   below is important. @@ -134,6 +132,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)  	}          afr_txn_arbitrate_fop_cbk (frame, this); +        afr_set_in_flight_sb_status (this, local, local->inode);  } diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index a9f7ad50fed..fae65d9d9a7 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -850,6 +850,13 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)                  goto out;  	} +        if (local->transaction.in_flight_sb) { +                local->op_ret = -1; +                local->op_errno = local->transaction.in_flight_sb_errno; +                afr_changelog_post_op_done (frame, this); +                goto out; +        } +  	xattr = dict_new ();  	if (!xattr) {  		local->op_ret = -1; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 71247c2c573..4bffc30788a 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -736,10 +736,17 @@ typedef struct _afr_local {  		gf_boolean_t uninherit_done;  		gf_boolean_t uninherit_value; +                gf_boolean_t in_flight_sb; /* Indicator for occurrence of +                                              split-brain while in the middle of +                                              a txn. */ +                int32_t in_flight_sb_errno; /* This is where the cause of the +                                               failure on the last good copy of +                                               the file is stored. +                                               */ +  		/* @changelog_resume: function to be called after changlogging  		   (either pre-op or post-op) is done  		*/ -  		afr_changelog_resume_t changelog_resume;                  call_frame_t *main_frame; @@ -872,6 +879,10 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,  	afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)  int +afr_inode_ctx_reset_unreadable_subvol (inode_t *inode, xlator_t *this, +                                       int subvol_idx, int txn_type); + +int  afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,                     uuid_t gfid, afr_inode_refresh_cbk_t cbk); @@ -1146,4 +1157,8 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd);  int  afr_get_msg_id (char *op_type); + +int +afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, +                             inode_t *inode);  #endif /* __AFR_H__ */ diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 175f6dfa71f..2e6759a2803 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -74,8 +74,8 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this,              gf_msg (this->name, GF_LOG_INFO, 0,                      EC_MSG_HEAL_SUCCESS, "Heal succeeded on %d/%d "                      "subvolumes", -                    ec_bits_count(mask & ~(good | bad)), -                    ec_bits_count(mask & ~good)); +                    gf_bits_count(mask & ~(good | bad)), +                    gf_bits_count(mask & ~good));          }      } @@ -333,7 +333,7 @@ void ec_complete(ec_fop_data_t * fop)          if (fop->answer == NULL) {              if (!list_empty(&fop->cbk_list)) {                  cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); -                healing_count = ec_bits_count (cbk->mask & fop->healing); +                healing_count = gf_bits_count (cbk->mask & fop->healing);                      /* fop shouldn't be treated as success if it is not                       * successful on at least fop->minimum good copies*/                  if ((cbk->count - healing_count) >= fop->minimum) { @@ -424,7 +424,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)      switch (fop->minimum)      {          case EC_MINIMUM_ALL: -            fop->minimum = ec_bits_count(fop->mask); +            fop->minimum = gf_bits_count(fop->mask);              if (fop->minimum >= ec->fragments)              {                  break; @@ -451,7 +451,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)      ec_trace("SELECT", fop, ""); -    num = ec_bits_count(fop->mask); +    num = gf_bits_count(fop->mask);      if ((num < fop->minimum) && (num < ec->fragments))      {          gf_msg (ec->xl->name, GF_LOG_ERROR, 0, @@ -500,7 +500,7 @@ void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask)      ec_t * ec = fop->xl->private;      int32_t count, idx; -    count = ec_bits_count(mask); +    count = gf_bits_count(mask);      LOCK(&fop->lock); @@ -578,7 +578,7 @@ void ec_dispatch_inc(ec_fop_data_t * fop)      if (ec_child_select(fop))      { -        fop->expected = ec_bits_count(fop->remaining); +        fop->expected = gf_bits_count(fop->remaining);          fop->first = 0;          ec_dispatch_next(fop, 0); @@ -591,7 +591,7 @@ ec_dispatch_all (ec_fop_data_t *fop)          ec_dispatch_start(fop);          if (ec_child_select(fop)) { -                fop->expected = ec_bits_count(fop->remaining); +                fop->expected = gf_bits_count(fop->remaining);                  fop->first = 0;                  ec_dispatch_mask(fop, fop->remaining); diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index c8f904ac51d..7cf8232353d 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -17,10 +17,6 @@  #include "ec-helpers.h"  #include "ec-messages.h" -#ifndef ffsll -#define ffsll(x) __builtin_ffsll(x) -#endif -  static const char * ec_fop_list[] =  {      [-EC_FOP_HEAL] = "HEAL" @@ -96,23 +92,6 @@ void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...)      }  } -int32_t ec_bits_count(uint64_t n) -{ -    n -= (n >> 1) & 0x5555555555555555ULL; -    n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL); -    n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL; -    n += n >> 8; -    n += n >> 16; -    n += n >> 32; - -    return n & 0xFF; -} - -int32_t ec_bits_index(uint64_t n) -{ -    return ffsll(n) - 1; -} -  int32_t ec_bits_consume(uint64_t * n)  {      uint64_t tmp; diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 1f39da2c09f..93d77726089 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -16,8 +16,6 @@  const char * ec_bin(char * str, size_t size, uint64_t value, int32_t digits);  const char * ec_fop_name(int32_t id);  void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...); -int32_t ec_bits_count(uint64_t n); -int32_t ec_bits_index(uint64_t n);  int32_t ec_bits_consume(uint64_t * n);  size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count,                        off_t offset, size_t size); diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c index 0253b51bf5e..ed835f1aadc 100644 --- a/xlators/cluster/ec/src/ec-locks.c +++ b/xlators/cluster/ec/src/ec-locks.c @@ -52,7 +52,7 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)      }      if (error == -1) { -        if (ec_bits_count(locked | notlocked) >= ec->fragments) { +        if (gf_bits_count(locked | notlocked) >= ec->fragments) {              if (notlocked == 0) {                  if (fop->answer == NULL) {                      fop->answer = cbk; | 
