diff options
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 23 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 8 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 281 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 24 |
4 files changed, 314 insertions, 22 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 4157174e40c..5b96a7789f9 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -850,6 +850,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE (local->transaction.postop_piggybacked); } @@ -2675,6 +2677,14 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_up (this, local->fd); + AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, &local->cont.fsync.prebuf, &local->cont.fsync.postbuf, @@ -2713,7 +2723,9 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, local->fd = fd_ref (fd); - afr_delayed_changelog_wake_up (this, fd); + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3872,6 +3884,15 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + ret = 0; out: return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index a83223569d0..ce4fbf22698 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -176,6 +176,9 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { + if (!local->stable_write) + afr_fd_report_unstable_write (this, local->fd); + afr_writev_handle_short_writes (frame, this); /* * Generally inode-write fops do transaction.unwind then @@ -471,6 +474,11 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, local->fd = fd_ref (fd); + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); + afr_open_fd_fix (fd, this); afr_do_writev (frame, this); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d20928d151d..76697f06b5c 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -574,6 +574,29 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, "failed to set pending entry"); } + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + index = afr_index_for_transaction_type (local->transaction.type); + + for (i = 0; i < priv->child_count; i++) { + if (local->pending[i][index] == 0) + return _gf_false; + } + + return _gf_true; +} + + int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { @@ -586,7 +609,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t *fdctx = NULL; dict_t **xattr = NULL; int piggyback = 0; - int index = 0; int nothing_failed = 1; local = frame->local; @@ -622,15 +644,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } - /* check if something has failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } - } + nothing_failed = afr_txn_nothing_failed (frame, this); afr_compute_txn_changelog (local , priv); @@ -656,15 +670,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) break; } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ + + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; afr_set_postop_dict (local, this, xattr[i], piggyback, i); @@ -1339,6 +1352,232 @@ afr_delayed_changelog_wake_up_cbk (void *data) } +/* + Check if the frame is destined to get optimized away + with changelog piggybacking +*/ +static gf_boolean_t +is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; + + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); + + if (!afr_txn_nothing_failed (frame, frame->this)) + /* something failed in this transaction, + we will be performing a hard post-op + */ + return _gf_false; + + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; + } + } + } + UNLOCK(&fd->lock); + + return piggyback; +} + + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); + + return 0; +} + +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_changelog_post_op_now (frame, this); + + return 0; +} + + +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, NULL); + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Time to fsync() */ + + afr_changelog_fsync (frame, this); + + return 0; +} + + void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd) { @@ -1374,7 +1613,7 @@ unlock: pthread_mutex_unlock (&fd_ctx->delay_lock); if (prev_frame) { - afr_changelog_post_op_now (prev_frame, this); + afr_changelog_post_op_safe (prev_frame, this); } } @@ -1389,7 +1628,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) if (is_afr_delayed_changelog_post_op_needed (frame, this)) afr_delayed_changelog_post_op (this, frame, local->fd); else - afr_changelog_post_op_now (frame, this); + afr_changelog_post_op_safe (frame, this); } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 978339017a2..5d9f752b95a 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -448,6 +448,12 @@ typedef struct _afr_local { int optimistic_change_log; gf_boolean_t delayed_post_op; + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + /* This struct contains the arguments for the "continuation" (scheme-like) of fops @@ -662,6 +668,12 @@ typedef struct _afr_local { afr_transaction_type type; + /* pre-compute the post piggyback status before + entering POST-OP phase + */ + int *postop_piggybacked; + + int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; @@ -723,6 +735,11 @@ typedef struct { gf_timer_t *delay_timer; call_frame_t *delay_frame; int call_child; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; } afr_fd_ctx_t; @@ -1078,4 +1095,11 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } \ } while (0); + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + #endif /* __AFR_H__ */ |