summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/bugs/bug-888174.t4
-rw-r--r--xlators/cluster/afr/src/afr-common.c23
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c8
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c281
-rw-r--r--xlators/cluster/afr/src/afr.h24
5 files changed, 316 insertions, 24 deletions
diff --git a/tests/bugs/bug-888174.t b/tests/bugs/bug-888174.t
index 6cf8d04ad..76ca3c961 100644
--- a/tests/bugs/bug-888174.t
+++ b/tests/bugs/bug-888174.t
@@ -36,14 +36,14 @@ inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {
TEST [ -z $inodelk_max_latency ]
-TEST dd of=$M0/a if=/dev/urandom bs=1M count=10
+TEST dd of=$M0/a if=/dev/urandom bs=1M count=10 conv=fsync
#Check for no trace of pending changelog. Flush should make sure of it.
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1
-dd of=$M0/a if=/dev/urandom bs=1M count=1024 2>/dev/null &
+dd of=$M0/a if=/dev/urandom bs=1M count=1024 oflag=sync 2>/dev/null &
p=$!
#trigger graph switches, tests for fsync not leaving any pending flags
TEST $CLI volume set $V0 performance.quick-read off
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4157174e4..5b96a7789 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -850,6 +850,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
loc_wipe (&local->transaction.parent_loc);
loc_wipe (&local->transaction.new_parent_loc);
+
+ GF_FREE (local->transaction.postop_piggybacked);
}
@@ -2675,6 +2677,14 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ /* If no new unstable writes happened between the
+ time we cleared the unstable write witness flag in afr_fsync
+ and now, calling afr_delayed_changelog_wake_up() should
+ wake up and skip over the fsync phase and go straight to
+ afr_changelog_post_op_now()
+ */
+ afr_delayed_changelog_wake_up (this, local->fd);
+
AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
&local->cont.fsync.prebuf,
&local->cont.fsync.postbuf,
@@ -2713,7 +2723,9 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
- afr_delayed_changelog_wake_up (this, fd);
+ if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3872,6 +3884,15 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count,
+ sizeof (int),
+ gf_afr_mt_int32_t);
+ if (!local->transaction.postop_piggybacked) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
ret = 0;
out:
return ret;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index a83223569..ce4fbf226 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -176,6 +176,9 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) {
+ if (!local->stable_write)
+ afr_fd_report_unstable_write (this, local->fd);
+
afr_writev_handle_short_writes (frame, this);
/*
* Generally inode-write fops do transaction.unwind then
@@ -471,6 +474,11 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
+
afr_open_fd_fix (fd, this);
afr_do_writev (frame, this);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index d20928d15..76697f06b 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -574,6 +574,29 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
"failed to set pending entry");
}
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int index = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ index = afr_index_for_transaction_type (local->transaction.type);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][index] == 0)
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
@@ -586,7 +609,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
afr_fd_ctx_t *fdctx = NULL;
dict_t **xattr = NULL;
int piggyback = 0;
- int index = 0;
int nothing_failed = 1;
local = frame->local;
@@ -622,15 +644,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
goto out;
}
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
- }
+ nothing_failed = afr_txn_nothing_failed (frame, this);
afr_compute_txn_changelog (local , priv);
@@ -656,15 +670,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
break;
}
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
+ /* local->transaction.postop_piggybacked[] was
+ precomputed in is_piggyback_postop() when called from
+ afr_changelog_post_op_safe()
+ */
+
+ piggyback = 0;
+ if (local->transaction.postop_piggybacked[i])
+ piggyback = 1;
afr_set_postop_dict (local, this, xattr[i],
piggyback, i);
@@ -1339,6 +1352,232 @@ afr_delayed_changelog_wake_up_cbk (void *data)
}
+/*
+ Check if the frame is destined to get optimized away
+ with changelog piggybacking
+*/
+static gf_boolean_t
+is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t piggyback = _gf_true;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ fdctx = afr_fd_ctx_get (fd, frame->this);
+
+ if (!afr_txn_nothing_failed (frame, frame->this))
+ /* something failed in this transaction,
+ we will be performing a hard post-op
+ */
+ return _gf_false;
+
+ LOCK(&fd->lock);
+ {
+ piggyback = _gf_true;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ local->transaction.postop_piggybacked[i] = 1;
+ } else {
+ /* For at least _one_ subvolume we cannot
+ piggyback on the changelog, and have to
+ perform a hard POST-OP and therefore fsync
+ if necesssary
+ */
+ piggyback = _gf_false;
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ return piggyback;
+}
+
+
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
+
+ return 0;
+}
+
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return witness;
+}
+
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (afr_fop_failed (op_ret, op_errno)) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_changelog_post_op_now (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, NULL);
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ if (is_piggyback_post_op (frame, local->fd)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Time to fsync() */
+
+ afr_changelog_fsync (frame, this);
+
+ return 0;
+}
+
+
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
{
@@ -1374,7 +1613,7 @@ unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
if (prev_frame) {
- afr_changelog_post_op_now (prev_frame, this);
+ afr_changelog_post_op_safe (prev_frame, this);
}
}
@@ -1389,7 +1628,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
if (is_afr_delayed_changelog_post_op_needed (frame, this))
afr_delayed_changelog_post_op (this, frame, local->fd);
else
- afr_changelog_post_op_now (frame, this);
+ afr_changelog_post_op_safe (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 978339017..5d9f752b9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -448,6 +448,12 @@ typedef struct _afr_local {
int optimistic_change_log;
gf_boolean_t delayed_post_op;
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
@@ -662,6 +668,12 @@ typedef struct _afr_local {
afr_transaction_type type;
+ /* pre-compute the post piggyback status before
+ entering POST-OP phase
+ */
+ int *postop_piggybacked;
+
+
int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
@@ -723,6 +735,11 @@ typedef struct {
gf_timer_t *delay_timer;
call_frame_t *delay_frame;
int call_child;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
} afr_fd_ctx_t;
@@ -1078,4 +1095,11 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
} \
} while (0);
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
#endif /* __AFR_H__ */