diff options
-rwxr-xr-x | tests/bugs/bug-853690.t | 94 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 61 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-algorithm.c | 20 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 |
6 files changed, 169 insertions, 16 deletions
diff --git a/tests/bugs/bug-853690.t b/tests/bugs/bug-853690.t new file mode 100755 index 00000000000..77a581f5444 --- /dev/null +++ b/tests/bugs/bug-853690.t @@ -0,0 +1,94 @@ +#!/bin/bash +# +# Bug 853690 - Test that short writes do not lead to corruption. +# +# Mismanagement of short writes in AFR leads to corruption and immediately +# detectable split-brain. Write a file to a replica volume using error-gen +# to cause short writes on one replica. +# +# Short writes are also possible during heal. If ignored, the files are marked +# consistent and silently differ. After reading the file, cause a lookup, wait +# for self-heal and verify that the afr xattrs do not match. +# +######## + +. $(dirname $0)/../include.rc + +cleanup; + +TEST mkdir -p $B0/test{1,2} + +# Our graph is a two brick replica with 100% frequency of short writes on one +# side of the replica. This guarantees a single write fop leads to an out-of-sync +# situation. +cat > $B0/test.vol <<EOF +volume test-posix-0 + type storage/posix + option directory $B0/test1 +end-volume + +volume test-error-0 + type debug/error-gen + option failure 100 + option enable writev + option error-no GF_ERROR_SHORT_WRITE + subvolumes test-posix-0 +end-volume + +volume test-locks-0 + type features/locks + subvolumes test-error-0 +end-volume + +volume test-posix-1 + type storage/posix + option directory $B0/test2 +end-volume + +volume test-locks-1 + type features/locks + subvolumes test-posix-1 +end-volume + +volume test-replicate-0 + type cluster/replicate + option background-self-heal-count 0 + subvolumes test-locks-0 test-locks-1 +end-volume +EOF + +TEST glusterd + +TEST glusterfs --volfile=$B0/test.vol --attribute-timeout=0 --entry-timeout=0 $M0 + +# Send a single write, guaranteed to be short on one replica, and attempt to +# read the data back. Failure to detect the short write results in different +# file sizes and immediate split-brain (EIO). +TEST dd if=/dev/zero of=$M0/file bs=128k count=1 +TEST dd if=$M0/file of=/dev/null bs=128k count=1 + +######## +# +# Test self-heal with short writes... +# +######## + +# Cause a lookup and wait a few seconds for posterity. This self-heal also fails +# due to a short write. +TEST ls $M0/file + +# Verify the attributes on the healthy replica do not reflect consistency with +# the other replica. +TEST "getfattr -n trusted.afr.test-locks-0 $B0/test2/file --only-values > $B0/out1 2> /dev/null" +TEST "getfattr -n trusted.afr.test-locks-1 $B0/test2/file --only-values > $B0/out2 2> /dev/null" +TEST ! cmp $B0/out1 $B0/out2 + +TEST rm -f $B0/out1 $B0/out2 +TEST rm -f $M0/file +TEST umount $M0 + +rm -f $B0/test.vol +rm -rf $B0/test1 $B0/test2 + +cleanup; + diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a1dd4a5ce57..2e339986621 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -879,6 +879,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); + GF_FREE(local->replies); + GF_FREE (local->child_up); GF_FREE (local->child_errno); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 94700ddae78..d9d3800fc47 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -77,8 +77,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int child_index = (long) cookie; int call_count = -1; int read_child = 0; + afr_private_t *priv = NULL; + int i = 0; local = frame->local; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -88,31 +91,50 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (afr_fop_failed (op_ret, op_errno)) afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - } - - local->op_errno = op_errno; + /* stage the best case return value for unwind */ + if ((local->success_count == 0) || (op_ret > local->op_ret)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + if (op_ret != -1) { + if ((local->success_count == 0) || + (child_index == read_child)) { + local->cont.writev.prebuf = *prebuf; + local->cont.writev.postbuf = *postbuf; + } + local->success_count++; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->transaction.unwind (frame, this); + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } + local->transaction.unwind (frame, this); local->transaction.resume (frame, this); } return 0; @@ -138,6 +160,15 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + local->op_ret = -1; + local->op_errno = ENOMEM; + local->transaction.unwind(frame, this); + local->transaction.resume(frame, this); + return 0; + } for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7e684801fae..e01ab366f4f 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,6 +41,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 9ad40592088..e03e1cc45f0 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -434,11 +434,20 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, sh->op_failed = 1; afr_sh_set_error (loop_sh, op_errno); - } + } else if (op_ret < loop_local->cont.writev.vector->iov_len) { + gf_log(this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + sh->op_failed = 1; + } call_count = afr_frame_return (loop_frame); if (call_count == 0) { + iobref_unref(loop_local->cont.writev.iobref); + sh_loop_return (sh_frame, this, loop_frame, loop_sh->op_ret, loop_sh->op_errno); } @@ -527,8 +536,17 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, sh_loop_return (sh_frame, this, loop_frame, 0, 0); goto out; } + loop_local->call_count = call_count; + /* + * We only really need the request size at the moment, but the buffer + * is required if we want to issue a retry in the event of a short write. + * Therefore, we duplicate the vector and ref the iobref here... + */ + loop_local->cont.writev.vector = iov_dup(vector, count); + loop_local->cont.writev.iobref = iobref_ref(iobref); + for (i = 0; i < priv->child_count; i++) { if (!loop_sh->write_needed[i]) continue; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index a0d1f3a7466..48dfbf37eb8 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -373,6 +373,12 @@ typedef struct _afr_locked_fd { struct list_head list; } afr_locked_fd_t; +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; +}; + typedef struct _afr_local { int uid; int gid; @@ -665,6 +671,7 @@ typedef struct _afr_local { mode_t umask; int xflag; gf_boolean_t do_discovery; + struct afr_reply *replies; } afr_local_t; typedef enum { |