diff options
| author | Anand Avati <avati@redhat.com> | 2013-07-24 03:53:16 -0700 | 
|---|---|---|
| committer | Anand Avati <avati@redhat.com> | 2013-08-13 23:45:03 -0700 | 
| commit | 8360037701788d49471cc0228fa873aa18382023 (patch) | |
| tree | 6c0aff80595683322507102ddb04986915511729 | |
| parent | 0d756dc618c1a4b659a3531aec449506ce577f50 (diff) | |
afr: treat appending writes as stable writes.
Durability of appending writes is implicit in the file size. Therefore,
performing an explicit fsync() is unnecessary in such cases, as self-heal
can check the size of the file when the pending changelog is ambiguous.
Change-Id: I05446180a91d20e0dbee5de5a7085b87d57f178a
BUG: 927146
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/5501
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 23 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 5 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 41 | 
5 files changed, 69 insertions, 3 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 763968c9..324e3f5b 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -107,6 +107,7 @@  #define ZR_FILE_CONTENT_STR     "glusterfs.file."  #define ZR_FILE_CONTENT_STRLEN 15 +#define GLUSTERFS_WRITE_IS_APPEND "glusterfs.write-is-append"  #define GLUSTERFS_OPEN_FD_COUNT "glusterfs.open-fd-count"  #define GLUSTERFS_INODELK_COUNT "glusterfs.inodelk-count"  #define GLUSTERFS_ENTRYLK_COUNT "glusterfs.entrylk-count" diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 03025641..691c1d4d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4487,6 +4487,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)                  goto out;          } +	local->append_write = _gf_false; +          ret = 0;  out:          return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 68570f15..a7441676 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -139,6 +139,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          int read_child  = 0;          int      ret = 0;          uint32_t open_fd_count = 0; +        uint32_t write_is_append = 0;          local = frame->local; @@ -173,6 +174,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                          local->open_fd_count = open_fd_count;                                          local->update_open_fd_count = _gf_true;                                  } + +				write_is_append = 0; +                                ret = dict_get_uint32 (xdata, +                                                       GLUSTERFS_WRITE_IS_APPEND, +                                                       &write_is_append); +                                
if (ret || !write_is_append) +					local->append_write = _gf_false;                          }  			if ((local->success_count == 0) || @@ -192,7 +200,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  if (local->update_open_fd_count)                          afr_handle_open_fd_count (frame, this); -                if (!local->stable_write) +                if (!local->stable_write && !local->append_write) +			/* An appended write removes the necessity to +			   fsync() the file. This is because self-heal +			   has the logic to check for larger file when +			   the xattrs are not reliably pointing at +			   a stale file. +			*/                          afr_fd_report_unstable_write (this, local->fd);                  afr_writev_handle_short_writes (frame, this); @@ -251,6 +265,13 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)          if (xdata) {                  ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,                                         sizeof (uint32_t)); +		ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, +				       0); +		/* Set append_write to be true speculatively. If on any +		   server it turns not be true, we unset it in the +		   callback. +		*/ +		local->append_write = _gf_true;          }          for (i = 0; i < priv->child_count; i++) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 49d281ac..2023613f 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -496,6 +496,11 @@ typedef struct _afr_local {  	*/  	gf_boolean_t      stable_write; +	/* This write appended to the file. Nnot necessarily O_APPEND, +	   just means the offset of write was at the end of file. 
+	*/ +	gf_boolean_t      append_write; +          /*            This struct contains the arguments for the "continuation"            (scheme-like) of fops diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 49d1effb..fc7c259e 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2199,7 +2199,7 @@ err:  }  dict_t* -_fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this) +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)  {          dict_t  *rsp_xdata = NULL;          int32_t ret = 0; @@ -2229,6 +2229,14 @@ _fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this)                          "dictionary value for %s", uuid_utoa (fd->inode->gfid),                          GLUSTERFS_OPEN_FD_COUNT);          } + +        ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, +                               is_append); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " +                        "dictionary value for %s", uuid_utoa (fd->inode->gfid), +                        GLUSTERFS_WRITE_IS_APPEND); +        }  out:          return rsp_xdata;  } @@ -2247,6 +2255,8 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          struct iatt            postop    = {0,};          int                      ret      = -1;          dict_t                *rsp_xdata = NULL; +	int                    is_append = 0; +	gf_boolean_t           locked = _gf_false;          VALIDATE_OR_GOTO (frame, out);          VALIDATE_OR_GOTO (this, out); @@ -2268,6 +2278,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          _fd = pfd->fd; +	if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { +		/* The write_is_append check and write must happen +		   atomically. Else another write can overtake this +		   write after the check and get written earlier. 
+ +		   So lock before preop-stat and unlock after write. +		*/ +		locked = _gf_true; +		LOCK(&fd->inode->lock); +	} +          op_ret = posix_fdstat (this, _fd, &preop);          if (op_ret == -1) {                  op_errno = errno; @@ -2277,8 +2298,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,                  goto out;          } +	if (locked) { +		if (preop.ia_size == offset || (fd->flags & O_APPEND)) +			is_append = 1; +	} +          op_ret = __posix_writev (_fd, vector, count, offset,                                   (pfd->flags & O_DIRECT)); + +	if (locked) { +		UNLOCK (&fd->inode->lock); +		locked = _gf_false; +	} +          if (op_ret < 0) {                  op_errno = -op_ret;                  op_ret = -1; @@ -2294,7 +2326,7 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          UNLOCK (&priv->lock);          if (op_ret >= 0) { -                rsp_xdata = _fill_open_fd_count (fd, xdata, this); +                rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append);                  /* wiretv successful, we also need to get the stat of                   * the file we wrote to                   */ @@ -2324,6 +2356,11 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,  out: +	if (locked) { +		UNLOCK (&fd->inode->lock); +		locked = _gf_false; +	} +          STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop,                               rsp_xdata);  | 
