summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xtests/bugs/bug-853690.t94
-rw-r--r--xlators/cluster/afr/src/afr-common.c2
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c61
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h1
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c20
-rw-r--r--xlators/cluster/afr/src/afr.h7
6 files changed, 169 insertions, 16 deletions
diff --git a/tests/bugs/bug-853690.t b/tests/bugs/bug-853690.t
new file mode 100755
index 00000000000..77a581f5444
--- /dev/null
+++ b/tests/bugs/bug-853690.t
@@ -0,0 +1,94 @@
+#!/bin/bash
+#
+# Bug 853690 - Test that short writes do not lead to corruption.
+#
+# Mismanagement of short writes in AFR leads to corruption and immediately
+# detectable split-brain. Write a file to a replica volume using error-gen
+# to cause short writes on one replica.
+#
+# Short writes are also possible during heal. If ignored, the files are marked
+# consistent and silently differ. After reading the file, cause a lookup, wait
+# for self-heal and verify that the afr xattrs do not match.
+#
+########
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+TEST mkdir -p $B0/test{1,2}
+
+# Our graph is a two brick replica with 100% frequency of short writes on one
+# side of the replica. This guarantees a single write fop leads to an out-of-sync
+# situation.
+cat > $B0/test.vol <<EOF
+volume test-posix-0
+ type storage/posix
+ option directory $B0/test1
+end-volume
+
+volume test-error-0
+ type debug/error-gen
+ option failure 100
+ option enable writev
+ option error-no GF_ERROR_SHORT_WRITE
+ subvolumes test-posix-0
+end-volume
+
+volume test-locks-0
+ type features/locks
+ subvolumes test-error-0
+end-volume
+
+volume test-posix-1
+ type storage/posix
+ option directory $B0/test2
+end-volume
+
+volume test-locks-1
+ type features/locks
+ subvolumes test-posix-1
+end-volume
+
+volume test-replicate-0
+ type cluster/replicate
+ option background-self-heal-count 0
+ subvolumes test-locks-0 test-locks-1
+end-volume
+EOF
+
+TEST glusterd
+
+TEST glusterfs --volfile=$B0/test.vol --attribute-timeout=0 --entry-timeout=0 $M0
+
+# Send a single write, guaranteed to be short on one replica, and attempt to
+# read the data back. Failure to detect the short write results in different
+# file sizes and immediate split-brain (EIO).
+TEST dd if=/dev/zero of=$M0/file bs=128k count=1
+TEST dd if=$M0/file of=/dev/null bs=128k count=1
+
+########
+#
+# Test self-heal with short writes...
+#
+########
+
+# Cause a lookup and wait a few seconds for posterity. This self-heal also fails
+# due to a short write.
+TEST ls $M0/file
+
+# Verify the attributes on the healthy replica do not reflect consistency with
+# the other replica.
+TEST "getfattr -n trusted.afr.test-locks-0 $B0/test2/file --only-values > $B0/out1 2> /dev/null"
+TEST "getfattr -n trusted.afr.test-locks-1 $B0/test2/file --only-values > $B0/out2 2> /dev/null"
+TEST ! cmp $B0/out1 $B0/out2
+
+TEST rm -f $B0/out1 $B0/out2
+TEST rm -f $M0/file
+TEST umount $M0
+
+rm -f $B0/test.vol
+rm -rf $B0/test1 $B0/test2
+
+cleanup;
+
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index a1dd4a5ce57..2e339986621 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -879,6 +879,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (local->dict)
dict_unref (local->dict);
+ GF_FREE(local->replies);
+
GF_FREE (local->child_up);
GF_FREE (local->child_errno);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 94700ddae78..d9d3800fc47 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -77,8 +77,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int child_index = (long) cookie;
int call_count = -1;
int read_child = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
local = frame->local;
+ priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -88,31 +91,50 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
if (afr_fop_failed (op_ret, op_errno))
afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- }
-
- local->op_errno = op_errno;
+ /* stage the best case return value for unwind */
+ if ((local->success_count == 0) || (op_ret > local->op_ret)) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+
+ if (op_ret != -1) {
+ if ((local->success_count == 0) ||
+ (child_index == read_child)) {
+ local->cont.writev.prebuf = *prebuf;
+ local->cont.writev.postbuf = *postbuf;
+ }
+ local->success_count++;
+ }
}
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) ||
+ (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+ local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
}
return 0;
@@ -138,6 +160,15 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ local->transaction.unwind(frame, this);
+ local->transaction.resume(frame, this);
+ return 0;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7e684801fae..e01ab366f4f 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -41,6 +41,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_shd_event_t,
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
+ gf_afr_mt_reply_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index 9ad40592088..e03e1cc45f0 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -434,11 +434,20 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
sh->op_failed = 1;
afr_sh_set_error (loop_sh, op_errno);
- }
+ } else if (op_ret < loop_local->cont.writev.vector->iov_len) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "incomplete write to %s on subvolume %s "
+ "(expected %lu, returned %d)", sh_local->loc.path,
+ priv->children[child_index]->name,
+ loop_local->cont.writev.vector->iov_len, op_ret);
+ sh->op_failed = 1;
+ }
call_count = afr_frame_return (loop_frame);
if (call_count == 0) {
+ iobref_unref(loop_local->cont.writev.iobref);
+
sh_loop_return (sh_frame, this, loop_frame,
loop_sh->op_ret, loop_sh->op_errno);
}
@@ -527,8 +536,17 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
sh_loop_return (sh_frame, this, loop_frame, 0, 0);
goto out;
}
+
loop_local->call_count = call_count;
+ /*
+ * We only really need the request size at the moment, but the buffer
+ * is required if we want to issue a retry in the event of a short write.
+ * Therefore, we duplicate the vector and ref the iobref here...
+ */
+ loop_local->cont.writev.vector = iov_dup(vector, count);
+ loop_local->cont.writev.iobref = iobref_ref(iobref);
+
for (i = 0; i < priv->child_count; i++) {
if (!loop_sh->write_needed[i])
continue;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index a0d1f3a7466..48dfbf37eb8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -373,6 +373,12 @@ typedef struct _afr_locked_fd {
struct list_head list;
} afr_locked_fd_t;
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+};
+
typedef struct _afr_local {
int uid;
int gid;
@@ -665,6 +671,7 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
gf_boolean_t do_discovery;
+ struct afr_reply *replies;
} afr_local_t;
typedef enum {