cluster/afr: ensure DATA operations are made durable before POST-OP

The changelogging scheme of AFR stores information about the state of all replicas in all replicas (in the extended attribute of the respective files on each server) in the form of 'pending counts' of operations (effectively "dirty flags"). These xattrs are blindly trusted while performing self-heal, and therefore utmost care has to be taken while updating and maintaing them. The most critical updation is the clearing of the pending counts corresponding to the *other* server in the changelog of a given server. Before clearing the pending count, we need durability guarantee of the write which was performed on the other server. To obtain such a guarantee, it may be necessary to explicitly introduce an fsync() phase (if the file itself wasn't already opened with O_SYNC). This patch introduces the detection of unstable stable writes on a file and issues explicit fsync() on the servers before performing the POST-OP clearing of pending flags. Change-Id: I2171b86a74ec91e40e5877eef0a4e7379578ecf7 BUG: 927146 Signed-off-by: Anand Avati <avati@redhat.com> Reviewed-on: http://review.gluster.org/4721 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
author: Anand Avati <avati@redhat.com> 2013-03-24 12:19:56 -0700
committer: Anand Avati <avati@redhat.com> 2013-03-27 22:15:53 -0700
commit: ca10fdc81a72a71ac67ac9fc8c5ad5b92febd875 (patch)
tree: 7c8cef642d283bf3a132499a1c82f3351ced475f /xlators/cluster/afr/src/afr-transaction.c
parent: 1f7dadccd45863ebea8f60339f297ac551e89899 (diff)
1 files changed, 260 insertions, 21 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index d20928d15..76697f06b 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -574,6 +574,29 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
                         "failed to set pending entry");
 }
 
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+	afr_private_t *priv = NULL;
+	afr_local_t *local = NULL;
+	int index = -1;
+	int i = 0;
+
+	local = frame->local;
+	priv = this->private;
+
+        index = afr_index_for_transaction_type (local->transaction.type);
+
+	for (i = 0; i < priv->child_count; i++) {
+                if (local->pending[i][index] == 0)
+			return _gf_false;
+        }
+
+	return _gf_true;
+}
+
+
 int
 afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
 {
@@ -586,7 +609,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
         afr_fd_ctx_t  *fdctx = NULL;
         dict_t        **xattr = NULL;
         int            piggyback = 0;
-        int            index = 0;
         int            nothing_failed = 1;
 
         local    = frame->local;
@@ -622,15 +644,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
                 goto out;
         }
 
-        /* check if something has failed, to handle piggybacking */
-        nothing_failed = 1;
-        index = afr_index_for_transaction_type (local->transaction.type);
-        for (i = 0; i < priv->child_count; i++) {
-                if (local->pending[i][index] == 0) {
-                        nothing_failed = 0;
-                        break;
-                }
-        }
+	nothing_failed = afr_txn_nothing_failed (frame, this);
 
         afr_compute_txn_changelog (local , priv);
 
@@ -656,15 +670,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
                                 break;
                         }
 
-                        LOCK (&local->fd->lock);
-                        {
-                                piggyback = 0;
-                                if (fdctx->pre_op_piggyback[i]) {
-                                        fdctx->pre_op_piggyback[i]--;
-                                        piggyback = 1;
-                                }
-                        }
-                        UNLOCK (&local->fd->lock);
+			/* local->transaction.postop_piggybacked[] was
+			   precomputed in is_piggyback_postop() when called from
+			   afr_changelog_post_op_safe()
+			*/
+
+			piggyback = 0;
+			if (local->transaction.postop_piggybacked[i])
+				piggyback = 1;
 
                         afr_set_postop_dict (local, this, xattr[i],
                                              piggyback, i);
@@ -1339,6 +1352,232 @@ afr_delayed_changelog_wake_up_cbk (void *data)
 }
 
 
+/*
+  Check if the frame is destined to get optimized away
+  with changelog piggybacking
+*/
+static gf_boolean_t
+is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
+{
+	afr_fd_ctx_t *fdctx = NULL;
+	afr_local_t *local = NULL;
+	gf_boolean_t piggyback = _gf_true;
+	afr_private_t *priv = NULL;
+	int i = 0;
+
+	priv = frame->this->private;
+	local = frame->local;
+	fdctx = afr_fd_ctx_get (fd, frame->this);
+
+	if (!afr_txn_nothing_failed (frame, frame->this))
+		/* something failed in this transaction,
+		   we will be performing a hard post-op
+		*/
+		return _gf_false;
+
+	LOCK(&fd->lock);
+	{
+		piggyback = _gf_true;
+
+		for (i = 0; i < priv->child_count; i++) {
+			if (!local->transaction.pre_op[i])
+				continue;
+			if (fdctx->pre_op_piggyback[i]) {
+				fdctx->pre_op_piggyback[i]--;
+				local->transaction.postop_piggybacked[i] = 1;
+			} else {
+				/* For at least _one_ subvolume we cannot
+				   piggyback on the changelog, and have to
+				   perform a hard POST-OP and therefore fsync
+				   if necesssary
+				*/
+				piggyback = _gf_false;
+			}
+		}
+	}
+	UNLOCK(&fd->lock);
+
+	return piggyback;
+}
+
+
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+	afr_fd_ctx_t *fdctx = NULL;
+
+	fdctx = afr_fd_ctx_get (fd, this);
+
+	LOCK(&fd->lock);
+	{
+		fdctx->witnessed_unstable_write = _gf_true;
+	}
+	UNLOCK(&fd->lock);
+
+	return 0;
+}
+
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+	afr_fd_ctx_t *fdctx = NULL;
+	gf_boolean_t witness = _gf_false;
+
+	fdctx = afr_fd_ctx_get (fd, this);
+
+	LOCK(&fd->lock);
+	{
+		if (fdctx->witnessed_unstable_write) {
+			witness = _gf_true;
+			fdctx->witnessed_unstable_write = _gf_false;
+		}
+	}
+	UNLOCK (&fd->lock);
+
+	return witness;
+}
+
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+			 int op_ret, int op_errno, struct iatt *pre,
+			 struct iatt *post, dict_t *xdata)
+{
+	afr_private_t *priv = NULL;
+        int child_index = (long) cookie;
+	int call_count = -1;
+	afr_local_t *local = NULL;
+
+	priv = this->private;
+	local = frame->local;
+
+	if (afr_fop_failed (op_ret, op_errno)) {
+		/* Failure of fsync() is as good as failure of previous
+		   write(). So treat it like one.
+		*/
+		gf_log (this->name, GF_LOG_WARNING,
+			"fsync(%s) failed on subvolume %s. Transaction was %s",
+			uuid_utoa (local->fd->inode->gfid),
+			priv->children[child_index]->name,
+			gf_fop_list[local->op]);
+
+		afr_transaction_fop_failed (frame, this, child_index);
+	}
+
+        call_count = afr_frame_return (frame);
+
+        if (call_count == 0)
+		afr_changelog_post_op_now (frame, this);
+
+	return 0;
+}
+
+
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t *local = NULL;
+	int i = 0;
+	int call_count = 0;
+	afr_private_t *priv = NULL;
+
+	local = frame->local;
+	priv = this->private;
+
+        call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+                                                     priv->child_count);
+
+	if (!call_count) {
+		/* will go straight to unlock */
+		afr_changelog_post_op_now (frame, this);
+		return 0;
+	}
+
+	local->call_count = call_count;
+
+	for (i = 0; i < priv->child_count; i++) {
+		if (!local->transaction.pre_op[i])
+			continue;
+
+		STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+				   (void *) (long) i, priv->children[i],
+				   priv->children[i]->fops->fsync, local->fd,
+				   1, NULL);
+		if (!--call_count)
+			break;
+	}
+
+	return 0;
+}
+
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+	afr_local_t    *local = NULL;
+
+	local = frame->local;
+
+	if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+		afr_changelog_post_op_now (frame, this);
+		return 0;
+	}
+
+	if (is_piggyback_post_op (frame, local->fd)) {
+		/* just detected that this post-op is about to
+		   be optimized away as a new write() has
+		   already piggybacked on this frame's changelog.
+		*/
+		afr_changelog_post_op_now (frame, this);
+		return 0;
+	}
+
+	/* Calling afr_changelog_post_op_now() now will result in
+	   issuing ->[f]xattrop().
+
+	   Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+	   responsible operation that what it might appear on the surface.
+
+	   The changelog of a file (in the xattr of the file on the server)
+	   stores information (pending count) about the state of the file
+	   on the OTHER server. This changelog is blindly trusted, and must
+	   therefore be updated in such a way it remains trustworthy. This
+	   implies that decrementing the pending count (essentially "clearing
+	   the dirty flag") must be done STRICTLY after we are sure that the
+	   operation on the other server has reached stable storage.
+
+	   While the backend filesystem on that server will eventually flush
+	   it to stable storage, we (being in userspace) have no mechanism
+	   to get notified when the write became "stable".
+
+	   This means we need take matter into our own hands and issue an
+	   fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+	   and get an acknowledgement for it. And we need to wait for the
+	   fsync() acknowledgement before initiating the hard POST-OP.
+
+	   However if the FD itself was opened in O_SYNC or O_DSYNC then
+	   we are already guaranteed that the writes were made stable as
+	   part of the FOP itself. The same holds true for NFS stable
+	   writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+	   flag set in the writev() @flags param. For all other write types,
+	   mark a flag in the fdctx whenever an unstable write is witnessed.
+	*/
+
+	if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+		afr_changelog_post_op_now (frame, this);
+		return 0;
+	}
+
+	/* Time to fsync() */
+
+	afr_changelog_fsync (frame, this);
+
+	return 0;
+}
+
+
 void
 afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
 {
@@ -1374,7 +1613,7 @@ unlock:
 	pthread_mutex_unlock (&fd_ctx->delay_lock);
 
 	if (prev_frame) {
-		afr_changelog_post_op_now (prev_frame, this);
+		afr_changelog_post_op_safe (prev_frame, this);
 	}
 }
 
@@ -1389,7 +1628,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
 	if (is_afr_delayed_changelog_post_op_needed (frame, this))
 		afr_delayed_changelog_post_op (this, frame, local->fd);
 	else
-		afr_changelog_post_op_now (frame, this);
+		afr_changelog_post_op_safe (frame, this);
 }
author	Anand Avati <avati@redhat.com>	2013-03-24 12:19:56 -0700
committer	Anand Avati <avati@redhat.com>	2013-03-27 22:15:53 -0700
commit	ca10fdc81a72a71ac67ac9fc8c5ad5b92febd875 (patch)
tree	7c8cef642d283bf3a132499a1c82f3351ced475f /xlators/cluster/afr/src/afr-transaction.c
parent	1f7dadccd45863ebea8f60339f297ac551e89899 (diff)