diff options
| author | Anand Avati <avati@redhat.com> | 2012-09-13 22:26:59 -0700 | 
|---|---|---|
| committer | Anand Avati <avati@redhat.com> | 2012-10-01 12:45:18 -0700 | 
| commit | c903de38da917239fe905fc6efa1f413d120fc04 (patch) | |
| tree | 6387677f13c700305e704dad7d274f656526287b | |
| parent | dd8eb06e26138e149252365043706067782416af (diff) | |
write-behind: implement causal ordering and other cleanup
Rules of causal ordering implemented:
- If request A arrives after the acknowledgement (to the app,
  i.e, STACK_UNWIND) of another request B, then request B is
  said to have 'caused' request A.
- (corollary) Two requests, which at any point of time, are
  unacknowledged simultaneously in the system can never 'cause'
  each other (wb_inode->gen is based on this)
- If request A is caused by request B, AND request A's region
  has an overlap with request B's region, then then the fulfillment
  of request A is guaranteed to happen after the fulfillment of B.
- FD of origin is not considered for the determination of causal
  ordering.
- Append operation's region is considered the whole file.
Other cleanup:
- wb_file_t not required any more.
- wb_local_t not required any more.
- O_RDONLY fd's operations now go through the queue to make sure
  writes in the requested region get fulfilled before getting
  processed.
- O_SYNC fd's operations now go through the queue to make sure
  previously acknowledged writes on the file (via other fds) are
  fulfilled before getting processed.
- Option to not honor O_SYNC is now removed.
- Option to ignore O_DIRECT is added (useful when running a VM and the
  drive appears with NCQ/TCQ or WCE=1 for the guest.)
- Option to disable_first_nbytes is removed (as the cause of the
  bug which required this was diagnosed to be missing TCP_NODELAY.)
- General cleanup and better conformance to coding style and convention.
Change-Id: Ib44fb72da3727246b4a85174cb568c2f0231f6de
BUG: 857673
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/3947
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-by: Amar Tumballi <amarts@redhat.com>
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 2 | ||||
| -rw-r--r-- | xlators/performance/write-behind/src/write-behind.c | 3486 | 
2 files changed, 1070 insertions, 2418 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 685470d6..acfe8386 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -164,6 +164,8 @@ static struct volopt_map_entry glusterd_volopt_map[] = {          {"performance.disk-usage-limit",         "performance/quota",         NULL, NULL, NO_DOC, 0},          {"performance.min-free-disk-limit",      "performance/quota",         NULL, NULL, NO_DOC, 0},          {"performance.write-behind-window-size", "performance/write-behind",  "cache-size", NULL, DOC}, +        {"performance.strict-o-direct",          "performance/write-behind",  "strict-O_DIRECT", NULL, DOC}, +        {"performance.strict-write-ordering",    "performance/write-behind",  "strict-write-ordering", NULL, DOC},          {"performance.read-ahead-page-count",    "performance/read-ahead",    "page-count", NULL, DOC},          {"network.frame-timeout",                "protocol/client",           NULL, NULL, NO_DOC, 0}, diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index ad1e5f03..53506d94 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -8,8 +8,6 @@    cases as published by the Free Software Foundation.  */ -/*TODO: check for non null wb_file_data before getting wb_file */ -  #ifndef _CONFIG_H  #define _CONFIG_H @@ -26,6 +24,7 @@  #include "common-utils.h"  #include "call-stub.h"  #include "statedump.h" +#include "defaults.h"  #include "write-behind-mem-types.h"  #define MAX_VECTOR_COUNT          8 @@ -34,96 +33,146 @@  typedef struct list_head list_head_t;  struct wb_conf; -struct wb_page;  struct wb_inode;  typedef struct wb_inode { -        size_t       window_conf; -        size_t       window_current; -        size_t       aggregate_current; -        int32_t      op_ret; +        ssize_t      window_conf; +        ssize_t      window_current; +	ssize_t      transit; /* size of data stack_wound, and yet +				 to be fulfilled (wb_fulfill_cbk). +				 used for trickling_writes +			      */ + +        int32_t      op_ret;   /* Last found op_ret and op_errno +				  while completing a liability +				  operation. Will be picked by +				  the next arriving writev/flush/fsync +			       */          int32_t      op_errno; -        list_head_t  request; -        list_head_t  passive_requests; + +        list_head_t  all;    /* All requests, from enqueue() till destroy(). +				Used only for resetting generation +				number when empty. +			     */ +        list_head_t  todo;   /* Work to do (i.e, STACK_WIND to server). +				Once we STACK_WIND, the entry is taken +				off the list. If it is non-sync write, +				then we continue to track it via @liability +				or @temptation depending on the status +				of its writeback. +			     */ +        list_head_t  liability;   /* Non-sync writes which are lied +				     (STACK_UNWIND'ed to caller) but ack +				     from server not yet complete. This +				     is the "liability" which we hold, and +				     must guarantee that dependent operations +				     which arrive later (which overlap, etc.) +				     are issued only after their dependencies +				     in this list are "fulfilled". + +				     Server acks for entries in this list +				     shrinks the window. + +				     The sum total of all req->write_size +				     of entries in this list must be kept less +				     than the permitted window size. +				  */ +        list_head_t  temptation;  /* Operations for which we are tempted +				     to 'lie' (write-behind), but temporarily +				     holding off (because of insufficient +				     window capacity, etc.) + +				     This is the list to look at to grow +				     the window (in __wb_pick_unwinds()). + +				     Entries typically get chosen from +				     write-behind from this list, and therefore +				     get "upgraded" to the "liability" list. +			     */ +	uint64_t     gen;    /* Liability generation number. Represents +				the current 'state' of liability. Every +				new addition to the liability list bumps +				the generation number. + +				a newly arrived request is only required +				to perform causal checks against the entries +				in the liability list which were present +				at the time of its addition. the generation +				number at the time of its addition is stored +				in the request and used during checks. + +				the liability list can grow while the request +				waits in the todo list waiting for its +				dependent operations to complete. however +				it is not of the request's concern to depend +				itself on those new entries which arrived +				after it arrived (i.e, those that have a +				liability generation higher than itself) +			     */          gf_lock_t    lock;          xlator_t    *this; -}wb_inode_t; - -typedef struct wb_file { -        int32_t           flags; -        int               disabled; -        fd_t             *fd; -        size_t            disable_till; -        enum _gf_boolean  dont_wind; -} wb_file_t; +} wb_inode_t;  typedef struct wb_request { -        list_head_t           list; +        list_head_t           all; +        list_head_t           todo; +	list_head_t           lie;  /* either in @liability or @temptation */          list_head_t           winds;          list_head_t           unwinds; -        list_head_t           other_requests; +          call_stub_t          *stub; -        size_t                write_size; + +        size_t                write_size;  /* currently held size +					      (after collapsing) */ +	size_t                orig_size;   /* size which arrived with the request. +					      This is the size by which we grow +					      the window when unwinding the frame. +					   */ +        size_t                total_size;  /* valid only in @head in wb_fulfill(). +					      This is the size with which we perform +					      STACK_WIND to server and therefore the +					      amount by which we shrink the window. +					   */ + +	int                   op_ret; +	int                   op_errno; +          int32_t               refcount;          wb_inode_t           *wb_inode;          glusterfs_fop_t       fop;          gf_lkowner_t          lk_owner; -        union { -                struct  { -                        char  write_behind; -                        char  stack_wound; -                        char  got_reply; -                        char  virgin; -                        char  flush_all;     /* while trying to sync to back-end, -                                             * don't wait till a data of size -                                             * equal to configured aggregate-size -                                             * is accumulated, instead sync -                                             * whatever data currently present in -                                             * request queue. -                                             */ - -                }write_request; - -                struct { -                        char marked_for_resume; -                }other_requests; -        }flags; +	struct iobref        *iobref; +	uint64_t              gen;  /* inode liability state at the time of +				       request arrival */ + +	fd_t                 *fd; +	struct { +		size_t        size;          /* 0 size == till infinity */ +		off_t         off; +		int           append:1;      /* offset is invalid. only one +						outstanding append at a time */ +		int           tempted:1;     /* true only for non-sync writes */ +		int           lied:1;        /* sin committed */ +		int           fulfilled:1;   /* got server acknowledgement */ +		int           go:1;          /* enough aggregating, good to go */ +	} ordering;  } wb_request_t; -struct wb_conf { + +typedef struct wb_conf {          uint64_t         aggregate_size;          uint64_t         window_size; -        uint64_t         disable_till; -        gf_boolean_t     enable_O_SYNC;          gf_boolean_t     flush_behind; -        gf_boolean_t     enable_trickling_writes; -}; +        gf_boolean_t     trickling_writes; +	gf_boolean_t     strict_write_ordering; +	gf_boolean_t     strict_O_DIRECT; +} wb_conf_t; -typedef struct wb_local { -        list_head_t     winds; -        int32_t         flags; -        fd_t           *fd; -        wb_request_t   *request; -        int             op_ret; -        int             op_errno; -        call_frame_t   *frame; -        int32_t         reply_count; -        wb_inode_t     *wb_inode; -} wb_local_t; - -typedef struct wb_conf wb_conf_t; -typedef struct wb_page wb_page_t; -int32_t -wb_process_queue (call_frame_t *frame, wb_inode_t *wb_inode); - -ssize_t -wb_sync (call_frame_t *frame, wb_inode_t *wb_inode, list_head_t *winds); +void +wb_process_queue (wb_inode_t *wb_inode); -ssize_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size, -                 char enable_trickling_writes);  wb_inode_t *  __wb_inode_ctx_get (xlator_t *this, inode_t *inode) @@ -156,37 +205,6 @@ out:  } -wb_file_t * -__wb_fd_ctx_get (xlator_t *this, fd_t *fd) -{ -        wb_file_t *wb_file = NULL; -        uint64_t   value   = 0; - -        __fd_ctx_get (fd, this, &value); -        wb_file = (wb_file_t *)(unsigned long)value; - -        return wb_file; -} - - -wb_file_t * -wb_fd_ctx_get (xlator_t *this, fd_t *fd) -{ -        wb_file_t *wb_file = NULL; - -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); -        GF_VALIDATE_OR_GOTO (this->name, fd, out); - -        LOCK (&fd->lock); -        { -                wb_file = __wb_fd_ctx_get (this, fd); -        } -        UNLOCK (&fd->lock); - -out: -        return wb_file; -} -  /*    Below is a succinct explanation of the code deciding whether two regions    overlap, from Pavan <tcp@gluster.com>. @@ -211,19 +229,26 @@ out:    }  */ -static inline char -wb_requests_overlap (wb_request_t *request1, wb_request_t *request2) +gf_boolean_t +wb_requests_overlap (wb_request_t *req1, wb_request_t *req2)  { -        off_t            r1_start   = 0, r1_end = 0, r2_start = 0, r2_end = 0; +        off_t            r1_start = 0; +	off_t            r1_end = 0; +	off_t            r2_start = 0; +	off_t            r2_end = 0;          enum _gf_boolean do_overlap = 0; -        r1_start = request1->stub->args.writev.off; -        r1_end = r1_start + iov_length (request1->stub->args.writev.vector, -                                        request1->stub->args.writev.count); +        r1_start = req1->ordering.off; +	if (req1->ordering.size) +		r1_end = r1_start + req1->ordering.size - 1; +	else +		r1_end = ULLONG_MAX; -        r2_start = request2->stub->args.writev.off; -        r2_end = r2_start + iov_length (request2->stub->args.writev.vector, -                                        request2->stub->args.writev.count); +        r2_start = req2->ordering.off; +	if (req2->ordering.size) +		r2_end = r2_start + req2->ordering.size - 1; +	else +		r2_end = ULLONG_MAX;          do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start)); @@ -231,72 +256,112 @@ wb_requests_overlap (wb_request_t *request1, wb_request_t *request2)  } -static inline char -wb_overlap (list_head_t *list, wb_request_t *request) +gf_boolean_t +wb_requests_conflict (wb_request_t *lie, wb_request_t *req)  { -        char          overlap = 0; -        wb_request_t *tmp     = NULL; +	wb_conf_t  *conf = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", list, out); -        GF_VALIDATE_OR_GOTO ("write-behind", request, out); +	conf = req->wb_inode->this->private; -        list_for_each_entry (tmp, list, list) { -                if (tmp == request) { -                        break; -                } +	if (lie == req) +		/* request cannot conflict with itself */ +		return _gf_false; -                overlap = wb_requests_overlap (tmp, request); -                if (overlap) { -                        break; -                } +	if (lie->gen >= req->gen) +		/* this liability entry was behind +		   us in the todo list */ +		return _gf_false; + +	if (lie->ordering.append) +		/* all modifications wait for the completion +		   of outstanding append */ +		return _gf_true; + +	if (conf->strict_write_ordering) +		/* We are sure (lie->gen < req->gen) by now. So +		   skip overlap check if strict write ordering is +		   requested and always return "conflict" against a +		   lower generation lie. */ +		return _gf_true; + +	return wb_requests_overlap (lie, req); +} + + +gf_boolean_t +wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) +{ +        wb_request_t *each     = NULL; + +        list_for_each_entry (each, &wb_inode->liability, lie) { +		if (wb_requests_conflict (each, req)) +			return _gf_true;          } -out: -        return overlap; +        return _gf_false;  }  static int -__wb_request_unref (wb_request_t *this) +__wb_request_unref (wb_request_t *req)  { -        int ret = -1; +        int         ret = -1; +	wb_inode_t *wb_inode = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); +	wb_inode = req->wb_inode; -        if (this->refcount <= 0) { +        if (req->refcount <= 0) {                  gf_log ("wb-request", GF_LOG_WARNING, -                        "refcount(%d) is <= 0", this->refcount); +                        "refcount(%d) is <= 0", req->refcount);                  goto out;          } -        ret = --this->refcount; -        if (this->refcount == 0) { -                list_del_init (&this->list); -                if (this->stub && this->stub->fop == GF_FOP_WRITE) { -                        call_stub_destroy (this->stub); -                } +        ret = --req->refcount; +        if (req->refcount == 0) { +                list_del_init (&req->todo); +                list_del_init (&req->lie); -                GF_FREE (this); -        } +		list_del_init (&req->all); +		if (list_empty (&wb_inode->all)) { +			wb_inode->gen = 0; +			/* in case of accounting errors? */ +			wb_inode->window_current = 0; +		} + +		list_del_init (&req->winds); +		list_del_init (&req->unwinds); + +                if (req->stub && req->ordering.tempted) { +                        call_stub_destroy (req->stub); +			req->stub = NULL; +                } /* else we would have call_resume()'ed */ + +		if (req->iobref) +			iobref_unref (req->iobref); +		if (req->fd) +			fd_unref (req->fd); + +                GF_FREE (req); +        }  out:          return ret;  }  static int -wb_request_unref (wb_request_t *this) +wb_request_unref (wb_request_t *req)  {          wb_inode_t *wb_inode = NULL;          int         ret      = -1; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); +        GF_VALIDATE_OR_GOTO ("write-behind", req, out); -        wb_inode = this->wb_inode; +        wb_inode = req->wb_inode;          LOCK (&wb_inode->lock);          { -                ret = __wb_request_unref (this); +                ret = __wb_request_unref (req);          }          UNLOCK (&wb_inode->lock); @@ -306,117 +371,155 @@ out:  static wb_request_t * -__wb_request_ref (wb_request_t *this) +__wb_request_ref (wb_request_t *req)  { -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); +        GF_VALIDATE_OR_GOTO ("write-behind", req, out); -        if (this->refcount < 0) { +        if (req->refcount < 0) {                  gf_log ("wb-request", GF_LOG_WARNING, -                        "refcount(%d) is < 0", this->refcount); -                this = NULL; +                        "refcount(%d) is < 0", req->refcount); +                req = NULL;                  goto out;          } -        this->refcount++; +        req->refcount++;  out: -        return this; +        return req;  }  wb_request_t * -wb_request_ref (wb_request_t *this) +wb_request_ref (wb_request_t *req)  {          wb_inode_t *wb_inode = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); +        GF_VALIDATE_OR_GOTO ("write-behind", req, out); -        wb_inode = this->wb_inode; +        wb_inode = req->wb_inode;          LOCK (&wb_inode->lock);          { -                this = __wb_request_ref (this); +                req = __wb_request_ref (req);          }          UNLOCK (&wb_inode->lock);  out: -        return this; +        return req;  } -wb_request_t * -wb_enqueue (wb_inode_t *wb_inode, call_stub_t *stub) +gf_boolean_t +wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)  { -        wb_request_t *request = NULL, *tmp = NULL; -        call_frame_t *frame   = NULL; -        wb_local_t   *local   = NULL; -        struct iovec *vector  = NULL; -        int32_t       count   = 0; +        wb_request_t *req     = NULL;          GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out);          GF_VALIDATE_OR_GOTO (wb_inode->this->name, stub, out); -        request = GF_CALLOC (1, sizeof (*request), gf_wb_mt_wb_request_t); -        if (request == NULL) { +        req = GF_CALLOC (1, sizeof (*req), gf_wb_mt_wb_request_t); +        if (!req)                  goto out; -        } -        INIT_LIST_HEAD (&request->list); -        INIT_LIST_HEAD (&request->winds); -        INIT_LIST_HEAD (&request->unwinds); -        INIT_LIST_HEAD (&request->other_requests); +        INIT_LIST_HEAD (&req->all); +        INIT_LIST_HEAD (&req->todo); +        INIT_LIST_HEAD (&req->lie); +        INIT_LIST_HEAD (&req->winds); +        INIT_LIST_HEAD (&req->unwinds); -        request->stub = stub; -        request->wb_inode = wb_inode; -        request->fop  = stub->fop; - -        frame = stub->frame; -        local = frame->local; -        if (local) { -                local->request = request; -        } +        req->stub = stub; +        req->wb_inode = wb_inode; +        req->fop  = stub->fop; +	req->ordering.tempted = tempted;          if (stub->fop == GF_FOP_WRITE) { -                vector = stub->args.writev.vector; -                count = stub->args.writev.count; +                req->write_size = iov_length (stub->args.writev.vector, +					      stub->args.writev.count); -                request->write_size = iov_length (vector, count); -                if (local) { -                        local->op_ret = request->write_size; -                        local->op_errno = 0; -                } +		/* req->write_size can change as we collapse +		   small writes. But the window needs to grow +		   only by how much we acknowledge the app. so +		   copy the original size in orig_size for the +		   purpose of accounting. +		*/ +		req->orig_size = req->write_size; + +		/* Let's be optimistic that we can +		   lie about it +		*/ +		req->op_ret = req->write_size; +		req->op_errno = 0; -                request->flags.write_request.virgin = 1; +		if (stub->args.writev.fd->flags & O_APPEND) +			req->ordering.append = 1;          } -        request->lk_owner = frame->root->lk_owner; +        req->lk_owner = stub->frame->root->lk_owner; + +	switch (stub->fop) { +	case GF_FOP_WRITE: +		req->ordering.off = stub->args.writev.off; +		req->ordering.size = req->write_size; + +		req->fd = fd_ref (stub->args.writev.fd); + +		break; +	case GF_FOP_READ: +		req->ordering.off = stub->args.readv.off; +		req->ordering.size = stub->args.readv.size; + +		req->fd = fd_ref (stub->args.readv.fd); + +		break; +	case GF_FOP_TRUNCATE: +		req->ordering.off = stub->args.truncate.off; +		req->ordering.size = 0; /* till infinity */ +		break; +	case GF_FOP_FTRUNCATE: +		req->ordering.off = stub->args.ftruncate.off; +		req->ordering.size = 0; /* till infinity */ + +		req->fd = fd_ref (stub->args.ftruncate.fd); + +		break; +	default: +		break; +	}          LOCK (&wb_inode->lock);          { -                list_add_tail (&request->list, &wb_inode->request); -                if (stub->fop == GF_FOP_WRITE) { -                        /* reference for stack winding */ -                        __wb_request_ref (request); - -                        /* reference for stack unwinding */ -                        __wb_request_ref (request); - -                        wb_inode->aggregate_current += request->write_size; -                } else { -                        list_for_each_entry (tmp, &wb_inode->request, list) { -                                if (tmp->stub && tmp->stub->fop -                                    == GF_FOP_WRITE) { -                                        tmp->flags.write_request.flush_all = 1; -                                } -                        } - -                        /*reference for resuming */ -                        __wb_request_ref (request); -                } +                list_add_tail (&req->all, &wb_inode->all); + +		req->gen = wb_inode->gen; + +                list_add_tail (&req->todo, &wb_inode->todo); +		__wb_request_ref (req); /* for wind */ + +		if (req->ordering.tempted) { +			list_add_tail (&req->lie, &wb_inode->temptation); +			__wb_request_ref (req); /* for unwind */ +		}          }          UNLOCK (&wb_inode->lock);  out: -        return request; +	if (!req) +		return _gf_false; + +	return _gf_true; +} + + +gf_boolean_t +wb_enqueue (wb_inode_t *wb_inode, call_stub_t *stub) +{ +	return wb_enqueue_common (wb_inode, stub, 0); +} + + +gf_boolean_t +wb_enqueue_tempted (wb_inode_t *wb_inode, call_stub_t *stub) +{ +	return wb_enqueue_common (wb_inode, stub, 1);  } @@ -426,18 +529,18 @@ __wb_inode_create (xlator_t *this, inode_t *inode)          wb_inode_t *wb_inode = NULL;          wb_conf_t  *conf     = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out);          GF_VALIDATE_OR_GOTO (this->name, inode, out);          conf = this->private;          wb_inode = GF_CALLOC (1, sizeof (*wb_inode), gf_wb_mt_wb_inode_t); -        if (wb_inode == NULL) { +        if (!wb_inode)                  goto out; -        } -        INIT_LIST_HEAD (&wb_inode->request); -        INIT_LIST_HEAD (&wb_inode->passive_requests); +        INIT_LIST_HEAD (&wb_inode->all); +        INIT_LIST_HEAD (&wb_inode->todo); +        INIT_LIST_HEAD (&wb_inode->liability); +        INIT_LIST_HEAD (&wb_inode->temptation);          wb_inode->this = this; @@ -452,58 +555,18 @@ out:  } -wb_file_t * -wb_file_create (xlator_t *this, fd_t *fd, int32_t flags) -{ -        wb_file_t *file = NULL; -        wb_conf_t *conf = NULL; - -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); -        GF_VALIDATE_OR_GOTO (this->name, fd, out); - -        conf = this->private; - -        file = GF_CALLOC (1, sizeof (*file), gf_wb_mt_wb_file_t); -        if (file == NULL) { -                goto out; -        } - -        /* -          fd_ref() not required, file should never decide the existence of -          an fd -        */ -        file->fd= fd; -        /* If O_DIRECT then, we disable chaching */ -        if (((flags & O_DIRECT) == O_DIRECT) -            || ((flags & O_ACCMODE) == O_RDONLY) -            || (((flags & O_SYNC) == O_SYNC) -                && conf->enable_O_SYNC == _gf_true)) { -                file->disabled = 1; -        } - -        file->flags = flags; - -        fd_ctx_set (fd, this, (uint64_t)(unsigned long)file); - -out: -        return file; -} - -  wb_inode_t *  wb_inode_create (xlator_t *this, inode_t *inode)  {          wb_inode_t *wb_inode = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out);          GF_VALIDATE_OR_GOTO (this->name, inode, out);          LOCK (&inode->lock);          {                  wb_inode = __wb_inode_ctx_get (this, inode); -                if (wb_inode == NULL) { +                if (!wb_inode)                          wb_inode = __wb_inode_create (this, inode); -                }          }          UNLOCK (&inode->lock); @@ -524,2390 +587,1049 @@ out:  } -int32_t -wb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -               int32_t op_ret, int32_t op_errno, inode_t *inode, -               struct iatt *buf, dict_t *dict, struct iatt *postparent) +void +__wb_fulfill_request (wb_request_t *req)  { -        wb_inode_t *wb_inode = NULL; +	wb_inode_t *wb_inode = NULL; -        if (op_ret < 0) { -                goto unwind; -        } +	wb_inode = req->wb_inode; -        wb_inode = wb_inode_create (this, inode); -        if (wb_inode == NULL) { -                op_ret = -1; -                op_errno = ENOMEM; -        } +	req->ordering.fulfilled = 1; +	wb_inode->window_current -= req->total_size; +	wb_inode->transit -= req->total_size; -unwind: -        STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, -                             dict, postparent); +	if (!req->ordering.lied) { +		/* TODO: fail the req->frame with error if +		   necessary +		*/ +	} -        return 0; +	__wb_request_unref (req);  } -int32_t -wb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, -           dict_t *xdata) +void +wb_head_done (wb_request_t *head)  { -        STACK_WIND (frame, wb_lookup_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->lookup, loc, xdata); -        return 0; -} - +	wb_request_t *req = NULL; +	wb_request_t *tmp     = NULL; +	wb_inode_t   *wb_inode = NULL; -int32_t -wb_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -             int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, -             dict_t *xdata) -{ -        wb_local_t   *local             = NULL; -        list_head_t  *winds             = NULL; -        wb_inode_t   *wb_inode          = NULL; -        wb_request_t *request           = NULL, *dummy = NULL; -        wb_local_t   *per_request_local = NULL; -        int32_t       ret               = -1; -	int32_t	      total_write_size	= 0; -        fd_t         *fd                = NULL; +	wb_inode = head->wb_inode; -        GF_ASSERT (frame); -        GF_ASSERT (this); +	LOCK (&wb_inode->lock); +	{ +		list_for_each_entry_safe (req, tmp, &head->winds, winds) { +			__wb_fulfill_request (req); +		} +		__wb_fulfill_request (head); +	} +	UNLOCK (&wb_inode->lock); +} -        local = frame->local; -        winds = &local->winds; -        fd = local->fd; +void +wb_inode_err (wb_inode_t *wb_inode, int op_errno) +{ +	LOCK (&wb_inode->lock); +	{ +		wb_inode->op_ret = -1; +		wb_inode->op_errno = op_errno; +	} +	UNLOCK (&wb_inode->lock); +} -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        GF_VALIDATE_OR_GOTO (this->name, wb_inode, out); -        LOCK (&wb_inode->lock); -        { -                list_for_each_entry_safe (request, dummy, winds, winds) { -                        request->flags.write_request.got_reply = 1; - -                        if (!request->flags.write_request.write_behind -                            && (op_ret == -1)) { -                                per_request_local = request->stub->frame->local; -                                per_request_local->op_ret = op_ret; -                                per_request_local->op_errno = op_errno; -                        } - -                        if (request->flags.write_request.write_behind) { -                                wb_inode->window_current -= request->write_size; -				total_write_size += request->write_size; -                        } - -                        __wb_request_unref (request); -                } +int +wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +		struct iatt *postbuf, dict_t *xdata) +{ +        wb_inode_t   *wb_inode   = NULL; +        wb_request_t *head       = NULL; -                if (op_ret == -1) { -                        wb_inode->op_ret = op_ret; -                        wb_inode->op_errno = op_errno; -		} else if (op_ret < total_write_size) { -			/* -			 * We've encountered a short write, for whatever reason. -			 * Set an EIO error for the next fop. This should be -			 * valid for writev or flush (close). -			 * -			 * TODO: Retry the write so we can potentially capture -			 * a real error condition (i.e., ENOSPC). -			 */ -			wb_inode->op_ret = -1; -			wb_inode->op_errno = EIO; -		} -        } -        UNLOCK (&wb_inode->lock); +	head = frame->local; +	frame->local = NULL; -        ret = wb_process_queue (frame, wb_inode); -        if (ret == -1) { -                if (errno == ENOMEM) { -                        LOCK (&wb_inode->lock); -                        { -                                wb_inode->op_ret = -1; -                                wb_inode->op_errno = ENOMEM; -                        } -                        UNLOCK (&wb_inode->lock); -                } +        wb_inode = head->wb_inode; -                gf_log (this->name, GF_LOG_WARNING, -                        "request queue processing failed"); -        } +	if (op_ret == -1) { +		wb_inode_err (wb_inode, op_errno); +	} else if (op_ret < head->total_size) { +		/* +		 * We've encountered a short write, for whatever reason. +		 * Set an EIO error for the next fop. This should be +		 * valid for writev or flush (close). +		 * +		 * TODO: Retry the write so we can potentially capture +		 * a real error condition (i.e., ENOSPC). +		 */ +		wb_inode_err (wb_inode, EIO); +	} -        /* safe place to do fd_unref */ -        fd_unref (fd); +	wb_head_done (head); -        frame->local = NULL; - -        if (local != NULL) { -                mem_put (frame->local); -        } +        wb_process_queue (wb_inode);          STACK_DESTROY (frame->root); -out:          return 0;  } -ssize_t -wb_sync (call_frame_t *frame, wb_inode_t *wb_inode, list_head_t *winds) -{ -        wb_request_t  *dummy                = NULL, *request = NULL; -        wb_request_t  *first_request        = NULL, *next = NULL; -        size_t         total_count          = 0, count = 0; -        size_t         copied               = 0; -        call_frame_t  *sync_frame           = NULL; -        struct iobref *iobref               = NULL; -        wb_local_t    *local                = NULL; -        struct iovec  *vector               = NULL; -        ssize_t        current_size         = 0, bytes = 0; -        size_t         bytecount            = 0; -        wb_conf_t     *conf                 = NULL; -        fd_t          *fd                   = NULL; -        int32_t        op_errno             = -1; -        off_t          next_offset_expected = 0; -        gf_lkowner_t   lk_owner             = {0, }; - -        GF_VALIDATE_OR_GOTO_WITH_ERROR ((wb_inode ? wb_inode->this->name -                                         : "write-behind"), frame, -                                        out, bytes, -1); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, wb_inode, out, bytes, -                                        -1); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, winds, out, bytes, -                                        -1); - -        conf = wb_inode->this->private; -        list_for_each_entry (request, winds, winds) { -                total_count += request->stub->args.writev.count; -                if (total_count > 0) { -                        break; -                } -        } +#define WB_IOV_LOAD(vec, cnt, req, head) do {				\ +		memcpy (&vec[cnt], req->stub->args.writev.vector,	\ +			(req->stub->args.writev.count * sizeof(vec[0]))); \ +		cnt += req->stub->args.writev.count;			\ +		head->total_size += req->write_size;			\ +	} while (0) -        if (total_count == 0) { -                gf_log (wb_inode->this->name, GF_LOG_TRACE, -                        "no vectors are to be synced"); -                goto out; -        } -        list_for_each_entry_safe (request, dummy, winds, winds) { -                if (!vector) { -                        vector = GF_MALLOC (VECTORSIZE (MAX_VECTOR_COUNT), -                                            gf_wb_mt_iovec); -                        if (vector == NULL) { -                                bytes = -1; -                                op_errno = ENOMEM; -                                goto out; -                        } - -                        iobref = iobref_new (); -                        if (iobref == NULL) { -                                bytes = -1; -                                op_errno = ENOMEM; -                                goto out; -                        } - -                        local = mem_get0 (THIS->local_pool); -                        if (local == NULL) { -                                bytes = -1; -                                op_errno = ENOMEM; -                                goto out; -                        } - -                        INIT_LIST_HEAD (&local->winds); - -                        first_request = request; -                        current_size = 0; - -                        next_offset_expected = request->stub->args.writev.off -                                + request->write_size; -                        lk_owner = request->lk_owner; -                } +void +wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) +{ +	struct iovec   vector[MAX_VECTOR_COUNT]; +	int            count = 0; +	wb_request_t  *req = NULL; +	call_frame_t  *frame = NULL; -                count += request->stub->args.writev.count; -                bytecount = VECTORSIZE (request->stub->args.writev.count); -                memcpy (((char *)vector)+copied, -                        request->stub->args.writev.vector, -                        bytecount); -                copied += bytecount; +	frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool); +	if (!frame) +		goto enomem; -                current_size += request->write_size; +	WB_IOV_LOAD (vector, count, head, head); -                if (request->stub->args.writev.iobref) { -                        iobref_merge (iobref, -                                      request->stub->args.writev.iobref); -                } +	list_for_each_entry (req, &head->winds, winds) { +		WB_IOV_LOAD (vector, count, req, head); -                next = NULL; -                if (request->winds.next != winds) { -                        next = list_entry (request->winds.next, -                                           wb_request_t, winds); -                } +		iobref_merge (head->stub->args.writev.iobref, +			      req->stub->args.writev.iobref); +	} -                list_del_init (&request->winds); -                list_add_tail (&request->winds, &local->winds); - -                if ((!next) -                    || ((count + next->stub->args.writev.count) -                        > MAX_VECTOR_COUNT) -                    || ((current_size + next->write_size) -                        > conf->aggregate_size) -                    || (next_offset_expected != next->stub->args.writev.off) -                    || (!is_same_lkowner (&lk_owner, &next->lk_owner)) -                    || (request->stub->args.writev.fd -                        != next->stub->args.writev.fd)) { - -                        sync_frame = copy_frame (frame); -                        if (sync_frame == NULL) { -                                bytes = -1; -                                op_errno = ENOMEM; -                                goto out; -                        } - -                        frame->root->lk_owner = lk_owner; - -                        local->wb_inode = wb_inode; -                        sync_frame->local = local; - -                        local->fd = fd = fd_ref (request->stub->args.writev.fd); - -                        bytes += current_size; -                        STACK_WIND (sync_frame, wb_sync_cbk, -                                    FIRST_CHILD(sync_frame->this), -                                    FIRST_CHILD(sync_frame->this)->fops->writev, -                                    fd, vector, count, -                                    first_request->stub->args.writev.off, -                                    first_request->stub->args.writev.flags, -                                    iobref, NULL); +	frame->root->lk_owner = head->lk_owner; +	frame->local = head; -                        iobref_unref (iobref); -                        GF_FREE (vector); -                        first_request = NULL; -                        iobref = NULL; -                        vector = NULL; -                        sync_frame = NULL; -                        local = NULL; -                        copied = count = 0; -                } -        } +	LOCK (&wb_inode->lock); +	{ +		wb_inode->transit += head->total_size; +	} +	UNLOCK (&wb_inode->lock); -out: -        if (sync_frame != NULL) { -                sync_frame->local = NULL; -                STACK_DESTROY (sync_frame->root); -        } +	STACK_WIND (frame, wb_fulfill_cbk, FIRST_CHILD (frame->this), +		    FIRST_CHILD (frame->this)->fops->writev, +		    head->fd, vector, count, +		    head->stub->args.writev.off, +		    head->stub->args.writev.flags, +		    head->stub->args.writev.iobref, NULL); -        if (local != NULL) { -                /* had we winded these requests, we would have unrefed -                 * in wb_sync_cbk. -                 */ -                list_for_each_entry_safe (request, dummy, &local->winds, -                                          winds) { -                        wb_request_unref (request); -                } +	return; +enomem: +	wb_inode_err (wb_inode, ENOMEM); -                mem_put (local); -                local = NULL; -        } +	wb_head_done (head); -        if (iobref != NULL) { -                iobref_unref (iobref); -        } +	return; +} -        GF_FREE (vector); - -        if (bytes == -1) { -                /* -                 * had we winded these requests, we would have unrefed -                 * in wb_sync_cbk. -                 */ -                if (local) { -                        list_for_each_entry_safe (request, dummy, &local->winds, -                                                  winds) { -                                wb_request_unref (request); -                        } -                } -                if (wb_inode != NULL) { -                        LOCK (&wb_inode->lock); -                        { -                                wb_inode->op_ret = -1; -                                wb_inode->op_errno = op_errno; -                        } -                        UNLOCK (&wb_inode->lock); -                } -        } - -        return bytes; -} +#define NEXT_HEAD(head, req) do {					\ +		if (head)						\ +			wb_fulfill_head (wb_inode, head);		\ +		head = req;						\ +		expected_offset = req->stub->args.writev.off +		\ +			req->write_size;				\ +		curr_aggregate = 0;					\ +		vector_count = 0;					\ +	} while (0) -int32_t -wb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -             int32_t op_errno, struct iatt *buf, dict_t *xdata) +void +wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)  { -        wb_local_t   *local         = NULL; -        wb_request_t *request       = NULL; -        call_frame_t *process_frame = NULL; -        wb_inode_t   *wb_inode      = NULL; -        int32_t       ret           = -1; - -        GF_ASSERT (frame); -        GF_ASSERT (this); +	wb_request_t  *req     = NULL; +	wb_request_t  *head    = NULL; +	wb_request_t  *tmp     = NULL; +	wb_conf_t     *conf    = NULL; +	off_t          expected_offset = 0; +	size_t         curr_aggregate = 0; +	size_t         vector_count = 0; -        local = frame->local; -        wb_inode = local->wb_inode; +	conf = wb_inode->this->private; -        request = local->request; -        if (request) { -                process_frame = copy_frame (frame); -                if (process_frame == NULL) { -                        op_ret = -1; -                        op_errno = ENOMEM; -                } -        } +	list_for_each_entry_safe (req, tmp, liabilities, winds) { +		list_del_init (&req->winds); -        STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); +		if (!head) { +			NEXT_HEAD (head, req); +			continue; +		} -        if (request != NULL) { -                wb_request_unref (request); -        } +		if (req->fd != head->fd) { +			NEXT_HEAD (head, req); +			continue; +		} -        if (process_frame != NULL) { -                ret = wb_process_queue (process_frame, wb_inode); -                if (ret == -1) { -                        if ((errno == ENOMEM) && (wb_inode != NULL)) { -                                LOCK (&wb_inode->lock); -                                { -                                        wb_inode->op_ret = -1; -                                        wb_inode->op_errno = ENOMEM; -                                } -                                UNLOCK (&wb_inode->lock); -                        } - -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } +		if (!is_same_lkowner (&req->lk_owner, &head->lk_owner)) { +			NEXT_HEAD (head, req); +			continue; +		} -                STACK_DESTROY (process_frame->root); -        } +		if (expected_offset != req->stub->args.writev.off) { +			NEXT_HEAD (head, req); +			continue; +		} -        return 0; -} +		if ((curr_aggregate + req->write_size) > conf->aggregate_size) { +			NEXT_HEAD (head, req); +			continue; +		} +		if (vector_count + req->stub->args.writev.count > +		    MAX_VECTOR_COUNT) { +			NEXT_HEAD (head, req); +			continue; +		} -static int32_t -wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ -        GF_ASSERT (frame); -        GF_ASSERT (this); +		list_add_tail (&req->winds, &head->winds); +		curr_aggregate += req->write_size; +		vector_count += req->stub->args.writev.count; +	} -        STACK_WIND (frame, wb_stat_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->stat, loc, xdata); -        return 0; +	if (head) +		wb_fulfill_head (wb_inode, head); +	return;  } -int32_t -wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +void +wb_do_unwinds (wb_inode_t *wb_inode, list_head_t *lies)  { -        wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1, op_errno = EINVAL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - -        if (loc->inode) { -                wb_inode = wb_inode_ctx_get (this, loc->inode); -        } - -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        local->wb_inode = wb_inode; - -        frame->local = local; - -        if (wb_inode) { -                stub = fop_stat_stub (frame, wb_stat_helper, loc, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +	wb_request_t *req = NULL; +	wb_request_t *tmp = NULL; +	call_frame_t *frame = NULL; +	struct iatt   buf = {0, }; -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_stat_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->stat, loc, xdata); -        } +        list_for_each_entry_safe (req, tmp, lies, unwinds) { +                frame = req->stub->frame; -        return 0; -unwind: -        STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL, NULL); +                STACK_UNWIND_STRICT (writev, frame, req->op_ret, req->op_errno, +				     &buf, &buf, NULL); /* :O */ +		req->stub->frame = NULL; -        if (stub) { -                call_stub_destroy (stub); +		list_del_init (&req->unwinds); +                wb_request_unref (req);          } -        return 0; +        return;  } -int32_t -wb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -              int32_t op_errno, struct iatt *buf, dict_t *xdata) +void +__wb_pick_unwinds (wb_inode_t *wb_inode, list_head_t *lies)  { -        wb_local_t   *local    = NULL; -        wb_request_t *request  = NULL; -        wb_inode_t   *wb_inode = NULL; -        int32_t       ret      = -1; - -        GF_ASSERT (frame); - -        local = frame->local; -        wb_inode = local->wb_inode; - -        request = local->request; -        if ((wb_inode != NULL) && (request != NULL)) { -                wb_request_unref (request); -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        if (errno == ENOMEM) { -                                op_ret = -1; -                                op_errno = ENOMEM; -                        } - -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } - -        STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); - -        return 0; -} +        wb_request_t *req = NULL; +        wb_request_t *tmp = NULL; +	list_for_each_entry_safe (req, tmp, &wb_inode->temptation, lie) { +		if (!req->ordering.fulfilled && +		    wb_inode->window_current > wb_inode->window_conf) +			continue; -int32_t -wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ -        GF_ASSERT (frame); -        GF_ASSERT (this); - -        STACK_WIND (frame, wb_fstat_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->fstat, fd, xdata); -        return 0; -} +		list_del_init (&req->lie); +		list_move_tail (&req->unwinds, lies); +		wb_inode->window_current += req->orig_size; -int32_t -wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ -        wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1; -        int           op_errno     = EINVAL; +		if (!req->ordering.fulfilled) { +			/* burden increased */ +			list_add_tail (&req->lie, &wb_inode->liability); -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); +			req->ordering.lied = 1; -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        if ((!IA_ISDIR (fd->inode->ia_type)) && (wb_inode == NULL)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "wb_inode not found for fd %p", fd); -                op_errno = EBADFD; -                goto unwind; -        } +			wb_inode->gen++; +		} +	} -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +        return; +} -        local->wb_inode = wb_inode; -        frame->local = local; +int +__wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req) +{ +        char          *ptr    = NULL; +        struct iobuf  *iobuf  = NULL; +        struct iobref *iobref = NULL; +        int            ret    = -1; -        if (wb_inode) { -                stub = fop_fstat_stub (frame, wb_fstat_helper, fd, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; +        if (!holder->iobref) { +                /* TODO: check the required size */ +                iobuf = iobuf_get (req->wb_inode->this->ctx->iobuf_pool); +                if (iobuf == NULL) { +                        goto out;                  } -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; +                iobref = iobref_new (); +                if (iobref == NULL) { +                        iobuf_unref (iobuf); +                        goto out;                  } -                /* -                  FIXME:should the request queue be emptied in case of error? -                */ -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); +                ret = iobref_add (iobref, iobuf); +                if (ret != 0) { +                        iobuf_unref (iobuf); +                        iobref_unref (iobref); +                        gf_log (req->wb_inode->this->name, GF_LOG_WARNING, +                                "cannot add iobuf (%p) into iobref (%p)", +                                iobuf, iobref); +                        goto out;                  } -        } else { -                STACK_WIND (frame, wb_fstat_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->fstat, fd, xdata); -        } - -        return 0; - -unwind: -        STACK_UNWIND_STRICT (fstat, frame, -1, op_errno, NULL, NULL); - -        if (stub) { -                call_stub_destroy (stub); -        } -        return 0; -} - - -int32_t -wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                 struct iatt *postbuf, dict_t *xdata) -{ -        wb_local_t   *local         = NULL; -        wb_request_t *request       = NULL; -        wb_inode_t   *wb_inode      = NULL; -        call_frame_t *process_frame = NULL; -        int32_t       ret           = -1; +                iov_unload (iobuf->ptr, holder->stub->args.writev.vector, +                            holder->stub->args.writev.count); +                holder->stub->args.writev.vector[0].iov_base = iobuf->ptr; +		holder->stub->args.writev.count = 1; -        GF_ASSERT (frame); +                iobref_unref (holder->stub->args.writev.iobref); +                holder->stub->args.writev.iobref = iobref; -        local = frame->local; -        wb_inode = local->wb_inode; -        request = local->request; +                iobuf_unref (iobuf); -        if ((request != NULL) && (wb_inode != NULL)) { -                process_frame = copy_frame (frame); -                if (process_frame == NULL) { -                        op_ret = -1; -                        op_errno = ENOMEM; -                } +                holder->iobref = iobref_ref (iobref);          } -        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, -                             postbuf, xdata); - -        if (request) { -                wb_request_unref (request); -        } +        ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size; -        if (process_frame != NULL) { -                ret = wb_process_queue (process_frame, wb_inode); -                if (ret == -1) { -                        if ((errno == ENOMEM) && (wb_inode != NULL)) { -                                LOCK (&wb_inode->lock); -                                { -                                        wb_inode->op_ret = -1; -                                        wb_inode->op_errno = ENOMEM; -                                } -                                UNLOCK (&wb_inode->lock); -                        } - -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } +        iov_unload (ptr, req->stub->args.writev.vector, +                    req->stub->args.writev.count); -                STACK_DESTROY (process_frame->root); -        } +        holder->stub->args.writev.vector[0].iov_len += req->write_size; +        holder->write_size += req->write_size; +        holder->ordering.size += req->write_size; -        return 0; +        ret = 0; +out: +        return ret;  } -static int32_t -wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, -                    off_t offset, dict_t *xdata) +void +__wb_preprocess_winds (wb_inode_t *wb_inode)  { -        GF_ASSERT (frame); -        GF_ASSERT (this); - -        STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); +        off_t         offset_expected = 0; +        size_t        space_left      = 0; +	wb_request_t *req             = NULL; +	wb_request_t *tmp             = NULL; +	wb_request_t *holder          = NULL; +	wb_conf_t    *conf            = NULL; +        int           ret             = 0; +	size_t        page_size       = 0; + +	/* With asynchronous IO from a VM guest (as a file), there +	   can be two sequential writes happening in two regions +	   of the file. But individual (broken down) IO requests +	   can arrive interleaved. + +	   TODO: cycle for each such sequence sifting +	   through the interleaved ops +	*/ + +	page_size = wb_inode->this->ctx->page_size; +	conf = wb_inode->this->private; + +        list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { +		if (!req->ordering.tempted) { +			if (holder) { +				if (wb_requests_conflict (holder, req)) +					/* do not hold on write if a +					   dependent write is in queue */ +					holder->ordering.go = 1; +			} +			/* collapse only non-sync writes */ +			continue; +		} else if (!holder) { +			/* holder is always a non-sync write */ +			holder = req; +			continue; +		} -        return 0; -} +		offset_expected = holder->stub->args.writev.off +			+ holder->write_size; +		if (req->stub->args.writev.off != offset_expected) { +			holder->ordering.go = 1; +			holder = req; +			continue; +		} -int32_t -wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, -             dict_t *xdata) -{ -        wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1, op_errno = EINVAL; +		if (!is_same_lkowner (&req->lk_owner, &holder->lk_owner)) { +			holder->ordering.go = 1; +			holder = req; +			continue; +		} -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); +		space_left = page_size - holder->write_size; -        if (loc->inode) { -                wb_inode = wb_inode_ctx_get (this, loc->inode); -        } +		if (space_left < req->write_size) { +			holder->ordering.go = 1; +			holder = req; +			continue; +		} -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +		ret = __wb_collapse_small_writes (holder, req); +		if (ret) +			continue; -        local->wb_inode = wb_inode; +		/* collapsed request is as good as wound +		   (from its p.o.v) +		*/ +		list_del_init (&req->todo); +		__wb_fulfill_request (req); -        frame->local = local; -        if (wb_inode) { -                stub = fop_truncate_stub (frame, wb_truncate_helper, loc, -                                          offset, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +               /* Only the last @holder in queue which -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +                  - does not have any non-buffered-writes following it +                  - has not yet filled its capacity -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->truncate, loc, offset, -                            xdata); +                  does not get its 'go' set, in anticipation of the arrival +                  of consecutive smaller writes. +               */          } -        return 0; - -unwind: -        STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL, NULL); +	/* but if trickling writes are enabled, then do not hold back +	   writes if there are no outstanding requests +	*/ -        if (stub) { -                call_stub_destroy (stub); -        } +	if (conf->trickling_writes && !wb_inode->transit && holder) +		holder->ordering.go = 1; -        return 0; +        return;  } -int32_t -wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                  struct iatt *postbuf, dict_t *xdata) +void +__wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, +		 list_head_t *liabilities)  { -        wb_local_t   *local    = NULL; -        wb_request_t *request  = NULL; -        wb_inode_t   *wb_inode = NULL; -        int32_t       ret      = -1; - -        GF_ASSERT (frame); - -        local = frame->local; -        wb_inode = local->wb_inode; -        request = local->request; - -        if ((request != NULL) && (wb_inode != NULL)) { -                wb_request_unref (request); -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        if (errno == ENOMEM) { -                                op_ret = -1; -                                op_errno = ENOMEM; -                        } - -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } - -        STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, -                             postbuf, xdata); +	wb_request_t *req = NULL; +	wb_request_t *tmp = NULL; -        return 0; -} +	list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { +		if (wb_liability_has_conflict (wb_inode, req)) +			continue; +		if (req->ordering.tempted && !req->ordering.go) +			/* wait some more */ +			continue; -static int32_t -wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, -                     off_t offset, dict_t *xdata) -{ -        GF_ASSERT (frame); -        GF_ASSERT (this); +		list_del_init (&req->todo); -        STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); -        return 0; +		if (req->ordering.tempted) +			list_add_tail (&req->winds, liabilities); +		else +			list_add_tail (&req->winds, tasks); +	}  } -int32_t -wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, -              dict_t *xdata) +void +wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks)  { -        wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1; -        int           op_errno     = EINVAL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); - -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        if ((!IA_ISDIR (fd->inode->ia_type)) && (wb_inode == NULL)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "wb_inode not found for fd %p", fd); -                op_errno = EBADFD; -                goto unwind; -        } - -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        local->wb_inode = wb_inode; - -        frame->local = local; - -        if (wb_inode) { -                stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, -                                           offset, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); -        } - -        return 0; +	wb_request_t *req = NULL; +	wb_request_t *tmp = NULL; -unwind: -        STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); +	list_for_each_entry_safe (req, tmp, tasks, winds) { +		list_del_init (&req->winds); -        if (stub) { -                call_stub_destroy (stub); -        } +		call_resume (req->stub); -        return 0; +		wb_request_unref (req); +	}  } -int32_t -wb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                int32_t op_ret, int32_t op_errno, struct iatt *statpre, -                struct iatt *statpost, dict_t *xdata) +void +wb_process_queue (wb_inode_t *wb_inode)  { -        wb_local_t   *local         = NULL; -        wb_request_t *request       = NULL; -        call_frame_t *process_frame = NULL; -        wb_inode_t   *wb_inode      = NULL; -        int32_t       ret           = -1; +        list_head_t tasks  = {0, }; +	list_head_t lies = {0, }; +	list_head_t liabilities = {0, }; -        GF_ASSERT (frame); +        INIT_LIST_HEAD (&tasks); +        INIT_LIST_HEAD (&lies); +        INIT_LIST_HEAD (&liabilities); -        local = frame->local; -        wb_inode = local->wb_inode; -        request = local->request; +        LOCK (&wb_inode->lock); +        { +		__wb_preprocess_winds (wb_inode); -        if (request) { -                process_frame = copy_frame (frame); -                if (process_frame == NULL) { -                        op_ret = -1; -                        op_errno = ENOMEM; -                } -        } +		__wb_pick_winds (wb_inode, &tasks, &liabilities); -        STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre, -                             statpost, xdata); +                __wb_pick_unwinds (wb_inode, &lies); -        if (request) { -                wb_request_unref (request);          } +        UNLOCK (&wb_inode->lock); -        if (request && (process_frame != NULL)) { -                ret = wb_process_queue (process_frame, wb_inode); -                if (ret == -1) { -                        if ((errno == ENOMEM) && (wb_inode != NULL)) { -                                LOCK (&wb_inode->lock); -                                { -                                        wb_inode->op_ret = -1; -                                        wb_inode->op_errno = ENOMEM; -                                } -                                UNLOCK (&wb_inode->lock); -                        } - -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } +	wb_do_unwinds (wb_inode, &lies); -                STACK_DESTROY (process_frame->root); -        } +	wb_do_winds (wb_inode, &tasks); -        return 0; +	wb_fulfill (wb_inode, &liabilities); + +        return;  } -static int32_t -wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, -                   struct iatt *stbuf, int32_t valid, dict_t *xdata) +int +wb_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, +		  struct iovec *vector, int32_t count, off_t offset, +		  uint32_t flags, struct iobref *iobref, dict_t *xdata)  { -        GF_ASSERT (frame); -        GF_ASSERT (this); - -        STACK_WIND (frame, wb_setattr_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); -        return 0; +	STACK_WIND (frame, default_writev_cbk, +		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, +		    fd, vector, count, offset, flags, iobref, xdata); +	return 0;  } -int32_t -wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, -            struct iatt *stbuf, int32_t valid, dict_t *xdata) +int +wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, +           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, +           dict_t *xdata)  { -        wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1, op_errno = EINVAL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); +        wb_inode_t   *wb_inode      = NULL; +	wb_conf_t    *conf          = NULL; +        gf_boolean_t  wb_disabled   = 0; +        call_stub_t  *stub          = NULL; +        int           ret           = -1; +	int           op_errno      = EINVAL; +	int           o_direct      = O_DIRECT; + +	conf = this->private; +        wb_inode = wb_inode_create (this, fd->inode); +	if (!wb_inode) { +		op_errno = ENOMEM; +		goto unwind; +	} + +	if (!conf->strict_O_DIRECT) +		o_direct = 0; + +	if (fd->flags & (O_SYNC|O_DSYNC|o_direct)) +		wb_disabled = 1; + +	if (flags & (O_SYNC|O_DSYNC|O_DIRECT)) +		/* O_DIRECT flag in params of writev must _always_ be honored */ +		wb_disabled = 1; + +	op_errno = 0; +	LOCK (&wb_inode->lock); +	{ +		/* pick up a previous error in fulfillment */ +		if (wb_inode->op_ret < 0) +			op_errno = wb_inode->op_errno; + +		wb_inode->op_ret = 0; +	} +	UNLOCK (&wb_inode->lock); + +        if (op_errno) +                goto unwind; -        local = mem_get0 (this->local_pool); -        if (local == NULL) { +	if (wb_disabled) +		stub = fop_writev_stub (frame, wb_writev_helper, fd, vector, +					count, offset, flags, iobref, xdata); +	else +		stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, +					flags, iobref, xdata); +        if (!stub) {                  op_errno = ENOMEM;                  goto unwind;          } -        frame->local = local; - -        if (loc->inode) { -                wb_inode = wb_inode_ctx_get (this, loc->inode); -        } +	if (wb_disabled) +		ret = wb_enqueue (wb_inode, stub); +	else +		ret = wb_enqueue_tempted (wb_inode, stub); -        local->wb_inode = wb_inode; +	if (!ret) { +		op_errno = ENOMEM; +		goto unwind; +	} -        if (wb_inode) { -                stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, -                                         valid, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_setattr_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->setattr, loc, stbuf, -                            valid, xdata); -        } +        wb_process_queue (wb_inode);          return 0; +  unwind: -        STACK_UNWIND_STRICT (setattr, frame, -1, op_errno, NULL, NULL, NULL); +        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); -        if (stub) { +        if (stub)                  call_stub_destroy (stub); -        }          return 0;  } -void -wb_disable_all (xlator_t *this, fd_t *origfd) -{ -        inode_t   *inode   = NULL; -        fd_t      *otherfd = NULL; -        wb_file_t *wb_file = NULL; - -        inode = origfd->inode; - -        LOCK(&inode->lock); -        { -                list_for_each_entry (otherfd, &inode->fd_list, inode_list) { -                        if (otherfd == origfd) { -                                continue; -                        } - -                        wb_file = wb_fd_ctx_get (this, otherfd); -                        if (wb_file == NULL) { -                                continue; -                        } - -                        gf_log(this->name,GF_LOG_DEBUG, -                               "disabling wb on %p because %p is O_SYNC", -                               otherfd, origfd); -                        wb_file->disabled = 1; -                } -        } -        UNLOCK(&inode->lock); -} -int32_t -wb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -             int32_t op_errno, fd_t *fd, dict_t *xdata) +int +wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +                 off_t offset, uint32_t flags, dict_t *xdata)  { -        int32_t     flags   = 0; -        wb_file_t  *file    = NULL; -        wb_local_t *local   = NULL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, out, op_errno, -                                        EINVAL); -        local = frame->local; -        GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, out, op_errno, -                                        EINVAL); - -        flags = local->flags; - -        if (op_ret != -1) { -                file = wb_file_create (this, fd, flags); -                if (file == NULL) { -                        op_ret = -1; -                        op_errno = ENOMEM; -                        goto out; -                } -        } - -out: -        frame->local = NULL; -        if (local != NULL) { -                mem_put (local); -        } - -        STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); +        STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, +                    xdata);          return 0;  } -int32_t -wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, -         fd_t *fd, dict_t *xdata) +int +wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +          off_t offset, uint32_t flags, dict_t *xdata)  { -        wb_local_t *local    = NULL; -        int32_t     op_errno = EINVAL; +        wb_inode_t   *wb_inode     = NULL; +        call_stub_t  *stub         = NULL; -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +        wb_inode = wb_inode_ctx_get (this, fd->inode); +	if (!wb_inode) +		goto noqueue; -        local->flags = flags; +	stub = fop_readv_stub (frame, wb_readv_helper, fd, size, +			       offset, flags, xdata); +	if (!stub) +		goto unwind; -        frame->local = local; +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; + +	wb_process_queue (wb_inode); -        STACK_WIND (frame, wb_open_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);          return 0;  unwind: -        STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL, NULL); +        STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, +                             NULL);          return 0; -} - - -int32_t -wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -               int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, -               struct iatt *buf, struct iatt *preparent, -               struct iatt *postparent, dict_t *xdata) -{ -        long        flags    = 0; -        wb_inode_t *wb_inode = NULL; -        wb_file_t  *file     = NULL; -        wb_local_t *local    = NULL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, out, -                                        op_errno, EINVAL); - -        if (op_ret != -1) { -                if (frame->local) { -                        flags = (long) frame->local; -                } - -                file = wb_file_create (this, fd, flags); -                if (file == NULL) { -                        op_ret = -1; -                        op_errno = ENOMEM; -                        goto out; -                } - -                LOCK (&inode->lock); -                { -                        wb_inode = __wb_inode_create (this, inode); -                        if (wb_inode == NULL) { -                                op_ret = -1; -                                op_errno = ENOMEM; -                        } -                } -                UNLOCK (&inode->lock); -        } - -        frame->local = NULL; - -out: -        STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, -                             preparent, postparent, xdata); - -        if (local != NULL) { -                mem_put (local); -        } +noqueue: +        STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, +                    xdata);          return 0;  } -int32_t -wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, -           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +int +wb_flush_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        int32_t     op_errno = EINVAL; -        wb_local_t *local    = NULL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); -        GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind); - -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        local->flags = flags; - -        frame->local = local; - -        STACK_WIND (frame, wb_create_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->create, -                    loc, flags, mode, umask, fd, xdata); -        return 0; - -unwind: -        STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, NULL, NULL, -                             NULL, NULL, NULL); +        STACK_DESTROY (frame->root);          return 0;  } -/* Mark all the contiguous write requests for winding starting from head of - * request list. Stops marking at the first non-write request found. If - * file is opened with O_APPEND, make sure all the writes marked for winding - * will fit into a single write call to server. - */ -size_t -__wb_mark_wind_all (wb_inode_t *wb_inode, list_head_t *list, list_head_t *winds) +int +wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        wb_request_t     *request       = NULL, *prev_request = NULL; -        wb_file_t        *wb_file       = NULL, *prev_wb_file = NULL; -        wb_file_t        *last_wb_file  = NULL; -        size_t            size          = 0; -        char              first_request = 1, overlap = 0; -        wb_conf_t        *conf          = NULL; -        int               count         = 0; -        enum _gf_boolean  dont_wind_set = 0; - -        GF_VALIDATE_OR_GOTO ("write-behind", wb_inode, out); -        GF_VALIDATE_OR_GOTO (wb_inode->this->name, list, out); -        GF_VALIDATE_OR_GOTO (wb_inode->this->name, winds, out); - -        conf = wb_inode->this->private; - -        list_for_each_entry (request, list, list) -        { -                if ((request->stub == NULL) -                    || (request->stub->fop != GF_FOP_WRITE)) { -                        break; -                } - -                wb_file = wb_fd_ctx_get (wb_inode->this, -                                         request->stub->args.writev.fd); -                if (wb_file == NULL) { -                        gf_log (wb_inode->this->name, GF_LOG_WARNING, -                                "write behind wb_file pointer is" -                                " not stored in context of fd(%p)", -                                request->stub->args.writev.fd); -                        goto out; -                } - -                /* If write requests from two fds are interleaved, for -                 * each of them, we can only send first set of adjacent -                 * requests that are on same fd. This is because, fds -                 * with O_APPEND cannot have more than one write fop in -                 * progress while syncing, so that order is not messed -                 * up. Since we group adjacent requests with same fd into -                 * single write call whenever possible, we need the above said -                 * measure. -                 */ -                if ((prev_wb_file != NULL) && (prev_wb_file->flags & O_APPEND) -                    && (prev_request->stub->args.writev.fd -                        != request->stub->args.writev.fd) -                    && (!prev_wb_file->dont_wind)) { -                        prev_wb_file->dont_wind = 1; -                        dont_wind_set = 1; -                        last_wb_file = prev_wb_file; -                } - -                prev_request = request; -                prev_wb_file = wb_file; - -                if (!request->flags.write_request.stack_wound) { -                        if (first_request) { -                                first_request = 0; -                        } else { -                                overlap = wb_overlap (list, request); -                                if (overlap) { -                                        continue; -                                } -                        } - -                        if ((wb_file->flags & O_APPEND) -                            && (((size + request->write_size) -                                 > conf->aggregate_size) -                                || ((count + request->stub->args.writev.count) -                                    > MAX_VECTOR_COUNT) -                                || (wb_file->dont_wind))) { -                                continue; -                        } - -                        size += request->write_size; - -                        wb_inode->aggregate_current -= request->write_size; - -                        count += request->stub->args.writev.count; - -                        request->flags.write_request.stack_wound = 1; -                        list_add_tail (&request->winds, winds); -                } -        } - -out: -        if (wb_inode != NULL) { -                wb_inode->aggregate_current -= size; -        } - -        if (dont_wind_set && (list != NULL)) { -                list_for_each_entry (request, list, list) { -                        wb_file = wb_fd_ctx_get (wb_inode->this, -                                                 request->stub->args.writev.fd); -                        if (wb_file != NULL) { -                                wb_file->dont_wind = 0; -                        } - -                        if (wb_file == last_wb_file) { -                                break; -                        } -                } -        } - -        return size; -} - +        wb_conf_t    *conf        = NULL; +        wb_inode_t   *wb_inode    = NULL; +        call_frame_t *bg_frame    = NULL; +	int           op_errno    = 0; +	int           op_ret      = 0; -int32_t -__wb_can_wind (list_head_t *list, char *other_fop_in_queue, -               char *overlapping_writes, char *incomplete_writes, -               char *wind_all) -{ -        wb_request_t *request         = NULL; -        char          first_request   = 1; -        int32_t       ret             = -1; -        char          overlap         = 0; +        conf = this->private; -        GF_VALIDATE_OR_GOTO ("write-behind", list, out); +	wb_inode = wb_inode_ctx_get (this, fd->inode); +	if (!wb_inode) { +		op_ret = -1; +		op_errno = EINVAL; +		goto unwind; +	} -        list_for_each_entry (request, list, list) +        LOCK (&wb_inode->lock);          { -                if ((request->stub == NULL) -                    || (request->stub->fop != GF_FOP_WRITE)) { -                        if (request->stub && other_fop_in_queue) { -                                *other_fop_in_queue = 1; -                        } -                        break; -                } - -                if (request->flags.write_request.stack_wound -                    && !request->flags.write_request.got_reply -                    && (incomplete_writes != NULL)) { -                        *incomplete_writes = 1; -                        break; -                } +		if (wb_inode->op_ret < 0) { +			op_ret = -1; +			op_errno = wb_inode->op_errno; +		} -                if (!request->flags.write_request.stack_wound) { -                        if (first_request) { -                                char flush = 0; -                                first_request = 0; - -                                flush = request->flags.write_request.flush_all; -                                if (wind_all != NULL) { -                                        *wind_all = flush; -                                } -                        } - -                        overlap = wb_overlap (list, request); -                        if (overlap) { -                                if (overlapping_writes != NULL) { -                                        *overlapping_writes = 1; -                                } - -                                break; -                        } -                } +                wb_inode->op_ret = 0;          } +        UNLOCK (&wb_inode->lock); -        ret = 0; -out: -        return ret; -} +	if (op_errno) +		goto unwind; +        if (conf->flush_behind) +		goto flushbehind; -ssize_t -__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf, -                 char enable_trickling_writes) -{ -        size_t        size                   = 0; -        char          other_fop_in_queue     = 0; -        char          incomplete_writes      = 0; -        char          overlapping_writes     = 0; -        wb_request_t *request                = NULL; -        wb_inode_t   *wb_inode               = NULL; -        char          wind_all               = 0; -        int32_t       ret                    = 0; - -        GF_VALIDATE_OR_GOTO ("write-behind", list, out); -        GF_VALIDATE_OR_GOTO ("write-behind", winds, out); - -        if (list_empty (list)) { -                goto out; -        } +	STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), +		    FIRST_CHILD(this)->fops->flush, fd, xdata); +	return 0; -        request = list_entry (list->next, typeof (*request), list); -        wb_inode = request->wb_inode; +flushbehind: +	bg_frame = copy_frame (frame); +	if (!bg_frame) { +		op_ret = -1; +		op_errno = ENOMEM; +		goto unwind; +	} -        ret = __wb_can_wind (list, &other_fop_in_queue, -                             &overlapping_writes, &incomplete_writes, -                             &wind_all); -        if (ret == -1) { -                gf_log (wb_inode->this->name, GF_LOG_WARNING, -                        "cannot decide whether to wind or not"); -                goto out; -        } - -        if (!incomplete_writes && ((enable_trickling_writes) -                                   || (wind_all) || (overlapping_writes) -                                   || (other_fop_in_queue) -                                   || (wb_inode->aggregate_current -                                       >= aggregate_conf))) { -                size = __wb_mark_wind_all (wb_inode, list, winds); -        } +	STACK_WIND (bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this), +		    FIRST_CHILD(this)->fops->flush, fd, xdata); +	/* fall through */ +unwind: +        STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); -out: -        return size; +        return 0;  } -size_t -__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size) +int +wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        size_t        written_behind = 0; -        wb_request_t *request        = NULL; -        wb_inode_t   *wb_inode       = NULL; - -        if (list_empty (list)) { -                goto out; -        } - -        request = list_entry (list->next, typeof (*request), list); -        wb_inode = request->wb_inode; - -        list_for_each_entry (request, list, list) -        { -                if ((request->stub == NULL) -                    || (request->stub->fop != GF_FOP_WRITE)) { -                        continue; -                } - -                if (written_behind <= size) { -                        if (!request->flags.write_request.write_behind) { -                                written_behind += request->write_size; -                                request->flags.write_request.write_behind = 1; -                                list_add_tail (&request->unwinds, unwinds); - -                                if (!request->flags.write_request.got_reply) { -                                        wb_inode->window_current -                                                += request->write_size; -                                } -                        } -                } else { -                        break; -                } -        } +        wb_inode_t   *wb_inode     = NULL; +        call_stub_t  *stub         = NULL; -out: -        return written_behind; -} +        wb_inode = wb_inode_ctx_get (this, fd->inode); +	if (!wb_inode) +		goto noqueue; +	stub = fop_flush_stub (frame, wb_flush_helper, fd, xdata); +	if (!stub) +		goto unwind; -void -__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds) -{ -        wb_request_t *request  = NULL; -        wb_inode_t   *wb_inode = NULL; +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -        GF_VALIDATE_OR_GOTO ("write-behind", list, out); -        GF_VALIDATE_OR_GOTO ("write-behind", unwinds, out); +	wb_process_queue (wb_inode); -        if (list_empty (list)) { -                goto out; -        } +        return 0; -        request = list_entry (list->next, typeof (*request), list); -        wb_inode = request->wb_inode; +unwind: +        STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, NULL); -        if (wb_inode->window_current <= wb_inode->window_conf) { -                __wb_mark_unwind_till (list, unwinds, -                                       wb_inode->window_conf -                                       - wb_inode->window_current); -        } +        return 0; -out: -        return; +noqueue: +        STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->flush, fd, xdata); +        return 0;  } -uint32_t -__wb_get_other_requests (list_head_t *list, list_head_t *other_requests) -{ -        wb_request_t *request = NULL; -        uint32_t      count   = 0; - -        GF_VALIDATE_OR_GOTO ("write-behind", list, out); -        GF_VALIDATE_OR_GOTO ("write-behind", other_requests, out); - -        list_for_each_entry (request, list, list) { -                if ((request->stub == NULL) -                    || (request->stub->fop == GF_FOP_WRITE)) { -                        break; -                } - -                if (!request->flags.other_requests.marked_for_resume) { -                        request->flags.other_requests.marked_for_resume = 1; -                        list_add_tail (&request->other_requests, -                                       other_requests); -                        count++; -                } -        } -out: -        return count; -} - - -int32_t -wb_stack_unwind (list_head_t *unwinds) +int +wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, +                 int32_t datasync, dict_t *xdata)  { -        struct iatt   buf     = {0,}; -        wb_request_t *request = NULL, *dummy = NULL; -        call_frame_t *frame   = NULL; -        wb_local_t   *local   = NULL; -        int           ret     = 0, write_requests_removed = 0; - -        GF_VALIDATE_OR_GOTO ("write-behind", unwinds, out); - -        list_for_each_entry_safe (request, dummy, unwinds, unwinds) { -                frame = request->stub->frame; -                local = frame->local; - -                STACK_UNWIND (frame, local->op_ret, local->op_errno, -                              &buf, &buf, NULL, NULL); - -                ret = wb_request_unref (request); -                if (ret == 0) { -                        write_requests_removed++; -                } -        } - -out: -        return write_requests_removed; +        STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); +        return 0;  } -int32_t -wb_resume_other_requests (call_frame_t *frame, wb_inode_t *wb_inode, -                          list_head_t *other_requests) +int +wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +          dict_t *xdata)  { -        int32_t       ret          = -1; -        wb_request_t *request      = NULL, *dummy = NULL; -        int32_t       fops_removed = 0; -        char          wind         = 0; +        wb_inode_t   *wb_inode     = NULL;          call_stub_t  *stub         = NULL; -        GF_VALIDATE_OR_GOTO ((wb_inode ? wb_inode->this->name : "write-behind"), -                             frame, out); -        GF_VALIDATE_OR_GOTO (frame->this->name, wb_inode, out); -        GF_VALIDATE_OR_GOTO (frame->this->name, other_requests, out); - -        if (list_empty (other_requests)) { -                ret = 0; -                goto out; -        } - -        list_for_each_entry_safe (request, dummy, other_requests, -                                  other_requests) { -                wind = request->stub->wind; -                stub = request->stub; - -                LOCK (&wb_inode->lock); -                { -                        request->stub = NULL; -                } -                UNLOCK (&wb_inode->lock); - -                if (!wind) { -                        wb_request_unref (request); -                        fops_removed++; -                } - -                call_resume (stub); -        } - -        ret = 0; - -        if (fops_removed > 0) { -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (frame->this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } - -out: -        return ret; -} - - -int32_t -wb_do_ops (call_frame_t *frame, wb_inode_t *wb_inode, list_head_t *winds, -           list_head_t *unwinds, list_head_t *other_requests) -{ -        int32_t ret = -1, write_requests_removed = 0; +        wb_inode = wb_inode_ctx_get (this, fd->inode); +	if (!wb_inode) +		goto noqueue; -        GF_VALIDATE_OR_GOTO ((wb_inode ? wb_inode->this->name : "write-behind"), -                             frame, out); -        GF_VALIDATE_OR_GOTO (frame->this->name, wb_inode, out); +	stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync, xdata); +	if (!stub) +		goto unwind; -        ret = wb_stack_unwind (unwinds); +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -        write_requests_removed = ret; +	wb_process_queue (wb_inode); -        ret = wb_sync (frame, wb_inode, winds); -        if (ret == -1) { -                gf_log (frame->this->name, GF_LOG_WARNING, -                        "syncing of write requests failed"); -        } +        return 0; -        ret = wb_resume_other_requests (frame, wb_inode, other_requests); -        if (ret == -1) { -                gf_log (frame->this->name, GF_LOG_WARNING, -                        "cannot resume non-write requests in request queue"); -        } +unwind: +        STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, NULL, NULL, NULL); -        /* wb_stack_unwind does wb_request_unref after unwinding a write -         * request. Hence if a write-request was just freed in wb_stack_unwind, -         * we have to process request queue once again to unblock requests -         * blocked on the writes just unwound. -         */ -        if (write_requests_removed > 0) { -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (frame->this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } +        return 0; -out: -        return ret; +noqueue: +        STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); +        return 0;  } -inline int -__wb_copy_into_holder (wb_request_t *holder, wb_request_t *request) +int +wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)  { -        char          *ptr    = NULL; -        struct iobuf  *iobuf  = NULL; -        struct iobref *iobref = NULL; -        int            ret    = -1; - -        if (holder->flags.write_request.virgin) { -                /* TODO: check the required size */ -                iobuf = iobuf_get (request->wb_inode->this->ctx->iobuf_pool); -                if (iobuf == NULL) { -                        goto out; -                } - -                iobref = iobref_new (); -                if (iobref == NULL) { -                        iobuf_unref (iobuf); -                        goto out; -                } - -                ret = iobref_add (iobref, iobuf); -                if (ret != 0) { -                        iobuf_unref (iobuf); -                        iobref_unref (iobref); -                        gf_log (request->wb_inode->this->name, GF_LOG_WARNING, -                                "cannot add iobuf (%p) into iobref (%p)", -                                iobuf, iobref); -                        goto out; -                } - -                iov_unload (iobuf->ptr, holder->stub->args.writev.vector, -                            holder->stub->args.writev.count); -                holder->stub->args.writev.vector[0].iov_base = iobuf->ptr; - -                iobref_unref (holder->stub->args.writev.iobref); -                holder->stub->args.writev.iobref = iobref; - -                iobuf_unref (iobuf); - -                holder->flags.write_request.virgin = 0; -        } - -        ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size; - -        iov_unload (ptr, request->stub->args.writev.vector, -                    request->stub->args.writev.count); - -        holder->stub->args.writev.vector[0].iov_len += request->write_size; -        holder->write_size += request->write_size; - -        request->flags.write_request.stack_wound = 1; -        list_move_tail (&request->list, &request->wb_inode->passive_requests); - -        ret = 0; -out: -        return ret; +        STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->stat, loc, xdata); +        return 0;  } -/* this procedure assumes that write requests have only one vector to write */ -void -__wb_collapse_write_bufs (list_head_t *requests, size_t page_size) +int +wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)  { -        off_t         offset_expected = 0; -        size_t        space_left      = 0; -        wb_request_t *request         = NULL, *tmp = NULL, *holder = NULL; -        int           ret             = 0; - -        GF_VALIDATE_OR_GOTO ("write-behind", requests, out); - -        list_for_each_entry_safe (request, tmp, requests, list) { -                if ((request->stub == NULL) -                    || (request->stub->fop != GF_FOP_WRITE) -                    || (request->flags.write_request.stack_wound)) { -                        holder = NULL; -                        continue; -                } - -                if (request->flags.write_request.write_behind) { -                        if (holder == NULL) { -                                holder = request; -                                continue; -                        } - -                        offset_expected = holder->stub->args.writev.off -                                + holder->write_size; - -                        if ((request->stub->args.writev.off != offset_expected) -                            || (!is_same_lkowner (&request->lk_owner, -                                                  &holder->lk_owner)) -                            || (holder->stub->args.writev.fd -                                != request->stub->args.writev.fd)) { -                                holder = request; -                                continue; -                        } - -                        space_left = page_size - holder->write_size; - -                        if (space_left >= request->write_size) { -                                ret = __wb_copy_into_holder (holder, request); -                                if (ret != 0) { -                                        break; -                                } - -                                __wb_request_unref (request); -                        } else { -                                holder = request; -                        } -                } else { -                        break; -                } -        } - -out: -        return; -} +        wb_inode_t   *wb_inode     = NULL; +        call_stub_t  *stub         = NULL; -int32_t -wb_process_queue (call_frame_t *frame, wb_inode_t *wb_inode) -{ -        list_head_t winds  = {0, }, unwinds = {0, }, other_requests = {0, }; -        size_t      size   = 0; -        wb_conf_t  *conf   = NULL; -        uint32_t    count  = 0; -        int32_t     ret    = -1; +	wb_inode = wb_inode_ctx_get (this, loc->inode); +        if (!wb_inode) +		goto noqueue; -        INIT_LIST_HEAD (&winds); -        INIT_LIST_HEAD (&unwinds); -        INIT_LIST_HEAD (&other_requests); +	stub = fop_stat_stub (frame, wb_stat_helper, loc, xdata); +	if (!stub) +		goto unwind; -        GF_VALIDATE_OR_GOTO ((wb_inode ? wb_inode->this->name : "write-behind"), -                             frame, out); -        GF_VALIDATE_OR_GOTO (wb_inode->this->name, frame, out); +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -        conf = wb_inode->this->private; -        GF_VALIDATE_OR_GOTO (wb_inode->this->name, conf, out); +	wb_process_queue (wb_inode); -        size = conf->aggregate_size; -        LOCK (&wb_inode->lock); -        { -                /* -                 * make sure requests are marked for unwinding and adjacent -                 * contiguous write buffers (each of size less than that of -                 * an iobuf) are packed properly so that iobufs are filled to -                 * their maximum capacity, before calling __wb_mark_winds. -                 */ -                __wb_mark_unwinds (&wb_inode->request, &unwinds); - -                __wb_collapse_write_bufs (&wb_inode->request, -                                          wb_inode->this->ctx->page_size); - -                count = __wb_get_other_requests (&wb_inode->request, -                                                 &other_requests); - -                if (count == 0) { -                        __wb_mark_winds (&wb_inode->request, &winds, size, -                                         conf->enable_trickling_writes); -                } +        return 0; -        } -        UNLOCK (&wb_inode->lock); +unwind: +        STACK_UNWIND_STRICT (stat, frame, -1, ENOMEM, NULL, NULL); -        ret = wb_do_ops (frame, wb_inode, &winds, &unwinds, &other_requests); +        if (stub) +                call_stub_destroy (stub); +        return 0; -out: -        return ret; +noqueue: +        STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->stat, loc, xdata); +	return 0;  } -int32_t -wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -               int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -               struct iatt *postbuf, dict_t *xdata) +int +wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        GF_ASSERT (frame); - -        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, -                             xdata); +        STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fstat, fd, xdata);          return 0;  } -int32_t -wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, -           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, -           dict_t *xdata) +int +wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        wb_inode_t   *wb_inode      = NULL; -        wb_file_t    *wb_file       = NULL; -        char          wb_disabled   = 0; -        call_frame_t *process_frame = NULL; -        call_stub_t  *stub          = NULL; -        wb_local_t   *local         = NULL; -        wb_request_t *request       = NULL; -        int32_t       ret           = -1; -        size_t        size          = 0; -        int32_t       op_ret        = -1, op_errno = EINVAL; - -        GF_ASSERT (frame); - -        GF_VALIDATE_OR_GOTO_WITH_ERROR ("write-behind", this, unwind, op_errno, -                                        EINVAL); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, -                                        EINVAL); +        wb_inode_t   *wb_inode     = NULL; +	call_stub_t  *stub         = NULL; -        if (vector != NULL) { -                size = iov_length (vector, count); -        }          wb_inode = wb_inode_ctx_get (this, fd->inode); -        if ((!IA_ISDIR (fd->inode->ia_type)) && (wb_inode == NULL)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "write behind wb_inode pointer is" -                        " not stored in context of inode(%p), returning EBADFD", -                        fd->inode); -                op_errno = EBADFD; -                goto unwind; -        } - -        if (wb_file != NULL) { -                if (wb_file->disabled || wb_file->disable_till) { -                        if (size > wb_file->disable_till) { -                                wb_file->disable_till = 0; -                        } else { -                                wb_file->disable_till -= size; -                        } -                        wb_disabled = 1; -                } -        } else { -                wb_disabled = 1; -        } - -        if (wb_inode != NULL) { -                LOCK (&wb_inode->lock); -                { -                        op_ret = wb_inode->op_ret; -                        op_errno = wb_inode->op_errno; -                } -                UNLOCK (&wb_inode->lock); -        } - -        if (op_ret == -1) { -                goto unwind; -        } - -        if (wb_disabled) { -                STACK_WIND (frame, wb_writev_cbk, FIRST_CHILD (frame->this), -                            FIRST_CHILD (frame->this)->fops->writev, -                            fd, vector, count, offset, flags, iobref, xdata); -                return 0; -        } - -        process_frame = copy_frame (frame); -        if (process_frame == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +	if (!wb_inode) +		goto noqueue; -        frame->local = local; -        local->wb_inode = wb_inode; +	stub = fop_fstat_stub (frame, wb_fstat_helper, fd, xdata); +	if (!stub) +		goto unwind; -        stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, flags, -                                iobref, xdata); -        if (stub == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -        request = wb_enqueue (wb_inode, stub); -        if (request == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        ret = wb_process_queue (process_frame, wb_inode); -        if (ret == -1) { -                gf_log (this->name, GF_LOG_WARNING, -                        "request queue processing failed"); -        } - -        STACK_DESTROY (process_frame->root); +	wb_process_queue (wb_inode);          return 0;  unwind: -        local = frame->local; -        frame->local = NULL; -        mem_put (local); - -        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, NULL); +        STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, NULL, NULL); -        if (process_frame) { -                STACK_DESTROY (process_frame->root); -        } - -        if (stub) { +        if (stub)                  call_stub_destroy (stub); -        } -          return 0; -} - - -int32_t -wb_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -              int32_t op_errno, struct iovec *vector, int32_t count, -              struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) -{ -        wb_local_t   *local    = NULL; -        wb_inode_t   *wb_inode = NULL; -        wb_request_t *request  = NULL; -        int32_t       ret      = 0; - -        GF_ASSERT (frame); - -        local = frame->local; -        wb_inode = local->wb_inode; -        request = local->request; - -        if ((request != NULL) && (wb_inode != NULL)) { -                wb_request_unref (request); - -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        if (errno == ENOMEM) { -                                op_ret = -1; -                                op_errno = ENOMEM; -                        } - -                        gf_log (frame->this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } - -        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, -                             stbuf, iobref, xdata); +noqueue: +        STACK_WIND (frame, default_fstat_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fstat, fd, xdata);          return 0;  } -static int32_t -wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, -                 off_t offset, uint32_t flags, dict_t *xdata) +int +wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, +                    off_t offset, dict_t *xdata)  { -        STACK_WIND (frame, wb_readv_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, -                    xdata); - +        STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);          return 0;  } -int32_t -wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, -          off_t offset, uint32_t flags, dict_t *xdata) +int +wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, +             dict_t *xdata)  {          wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL; -        call_stub_t  *stub         = NULL; -        int32_t       ret          = -1, op_errno = 0; -        wb_request_t *request      = NULL; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, -                                        op_errno, EINVAL); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, -                                        EINVAL); - -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        if ((!IA_ISDIR (fd->inode->ia_type)) && (wb_inode == NULL)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "write behind wb_inode pointer is" -                        " not stored in context of inode(%p), returning " -                        "EBADFD", fd->inode); -                op_errno = EBADFD; -                goto unwind; -        } - -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } +	call_stub_t  *stub         = NULL; -        local->wb_inode = wb_inode; +	wb_inode = wb_inode_create (this, loc->inode); +	if (!wb_inode) +		goto unwind; -        frame->local = local; -        if (wb_inode) { -                stub = fop_readv_stub (frame, wb_readv_helper, fd, size, -                                       offset, flags, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +	stub = fop_truncate_stub (frame, wb_truncate_helper, loc, +				  offset, xdata); +	if (!stub) +		goto unwind; -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        call_stub_destroy (stub); -                        op_errno = ENOMEM; -                        goto unwind; -                } +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_readv_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->readv, -                            fd, size, offset, flags, xdata); -        } +	wb_process_queue (wb_inode);          return 0;  unwind: -        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, -                             NULL); -        return 0; -} - - -int32_t -wb_ffr_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -               int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ -        STACK_DESTROY (frame->root); -        return 0; -} - +        STACK_UNWIND_STRICT (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); -int32_t -wb_ffr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -            int32_t op_errno, dict_t *xdata) -{ -        wb_local_t *local    = NULL; -        wb_inode_t *wb_inode = NULL; - -        GF_ASSERT (frame); - -        local = frame->local; -        wb_inode = local->wb_inode; - -        if (wb_inode != NULL) { -                LOCK (&wb_inode->lock); -                { -                        if (wb_inode->op_ret == -1) { -                                op_ret = wb_inode->op_ret; -                                op_errno = wb_inode->op_errno; - -                                wb_inode->op_ret = 0; -                        } -                } -                UNLOCK (&wb_inode->lock); -        } - -        STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); +        if (stub) +                call_stub_destroy (stub);          return 0;  } -int32_t -wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +int +wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, +                     off_t offset, dict_t *xdata)  { -        wb_conf_t    *conf        = NULL; -        wb_local_t   *local       = NULL; -        wb_inode_t   *wb_inode    = NULL; -        call_frame_t *flush_frame = NULL, *process_frame = NULL; -        int32_t       op_ret      = -1, op_errno = -1, ret = -1; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, -                                        op_errno, EINVAL); - -        conf = this->private; - -        local = frame->local; -        wb_inode = local->wb_inode; - -        LOCK (&wb_inode->lock); -        { -                op_ret = wb_inode->op_ret; -                op_errno = wb_inode->op_errno; -        } -        UNLOCK (&wb_inode->lock); - -        if (local && local->request) { -                process_frame = copy_frame (frame); -                if (process_frame == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                wb_request_unref (local->request); -        } - -        if (conf->flush_behind) { -                flush_frame = copy_frame (frame); -                if (flush_frame == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                STACK_WIND (flush_frame, wb_ffr_bg_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->flush, fd, xdata); -        } else { -                STACK_WIND (frame, wb_ffr_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->flush, fd, xdata); -        } - -        if (process_frame != NULL) { -                ret = wb_process_queue (process_frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } - -                STACK_DESTROY (process_frame->root); -        } - -        if (conf->flush_behind) { -                STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); -        } - -        return 0; - -unwind: -        STACK_UNWIND_STRICT (flush, frame, -1, op_errno, NULL); +        STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);          return 0;  } -int32_t -wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +int +wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +              dict_t *xdata)  { -        wb_conf_t    *conf         = NULL;          wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL;          call_stub_t  *stub         = NULL; -        call_frame_t *flush_frame  = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = 0, op_errno = 0; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, -                                        op_errno, EINVAL); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, fd, unwind, op_errno, -                                        EINVAL); - -        conf = this->private; - -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        if ((!IA_ISDIR (fd->inode->ia_type)) && (wb_inode == NULL)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "write behind wb_inode pointer is" -                        " not stored in context of inode(%p), " -                        "returning EBADFD", fd->inode); -                op_errno = EBADFD; -                goto unwind; -        } - -        if (wb_inode != NULL) { -                local = mem_get0 (this->local_pool); -                if (local == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } - -                local->wb_inode = wb_inode; -                frame->local = local; +        wb_inode = wb_inode_create (this, fd->inode); +	if (!wb_inode) +		goto unwind; -                stub = fop_flush_stub (frame, wb_flush_helper, fd, xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +	stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, +				   offset, xdata); +	if (!stub) +		goto unwind; -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        call_stub_destroy (stub); -                        op_errno = ENOMEM; -                        goto unwind; -                } +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                if (conf->flush_behind) { -                        flush_frame = copy_frame (frame); -                        if (flush_frame == NULL) { -                                op_errno = ENOMEM; -                                goto unwind; -                        } - -                        STACK_UNWIND_STRICT (flush, frame, 0, 0, NULL); - -                        STACK_WIND (flush_frame, wb_ffr_bg_cbk, -                                    FIRST_CHILD(this), -                                    FIRST_CHILD(this)->fops->flush, fd, xdata); -                } else { -                        STACK_WIND (frame, wb_ffr_cbk, FIRST_CHILD(this), -                                    FIRST_CHILD(this)->fops->flush, fd, xdata); -                } -        } +	wb_process_queue (wb_inode);          return 0;  unwind: -        STACK_UNWIND_STRICT (flush, frame, -1, op_errno, NULL); -        return 0; -} - - -static int32_t -wb_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, -              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, -              dict_t *xdata) -{ -        wb_local_t   *local    = NULL; -        wb_inode_t   *wb_inode = NULL; -        wb_request_t *request  = NULL; -        int32_t       ret      = -1; - -        GF_ASSERT (frame); - -        local = frame->local; -        wb_inode = local->wb_inode; -        request = local->request; - -        if (wb_inode != NULL) { -                LOCK (&wb_inode->lock); -                { -                        if (wb_inode->op_ret == -1) { -                                op_ret = wb_inode->op_ret; -                                op_errno = wb_inode->op_errno; - -                                wb_inode->op_ret = 0; -                        } -                } -                UNLOCK (&wb_inode->lock); - -                if (request) { -                        wb_request_unref (request); -                        ret = wb_process_queue (frame, wb_inode); -                        if (ret == -1) { -                                if (errno == ENOMEM) { -                                        op_ret = -1; -                                        op_errno = ENOMEM; -                                } - -                                gf_log (this->name, GF_LOG_WARNING, -                                        "request queue processing failed"); -                        } -                } - -        } - -        STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, -                             xdata); +        STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); +        if (stub) +                call_stub_destroy (stub);          return 0;  } -static int32_t -wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, -                 int32_t datasync, dict_t *xdata) +int +wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, +                   struct iatt *stbuf, int32_t valid, dict_t *xdata)  { -        STACK_WIND (frame, wb_fsync_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); +        STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);          return 0;  } -int32_t -wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, -          dict_t *xdata) +int +wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +            struct iatt *stbuf, int32_t valid, dict_t *xdata)  {          wb_inode_t   *wb_inode     = NULL; -        wb_local_t   *local        = NULL;          call_stub_t  *stub         = NULL; -        wb_request_t *request      = NULL; -        int32_t       ret          = -1, op_errno = 0; - -        GF_ASSERT (frame); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, this, unwind, -                                        op_errno, EINVAL); -        GF_VALIDATE_OR_GOTO_WITH_ERROR (frame->this->name, fd, unwind, -                                        op_errno, EINVAL); -        wb_inode = wb_inode_ctx_get (this, fd->inode); -        if (wb_inode == NULL && (!IA_ISDIR (fd->inode->ia_type))) { -                gf_log (this->name, GF_LOG_WARNING, -                        "write behind wb_inode pointer is" -                        " not stored in context of inode(%p), " -                        "returning EBADFD", fd->inode); -                op_errno = EBADFD; -                goto unwind; -        } +	wb_inode = wb_inode_ctx_get (this, loc->inode); +	if (!wb_inode) +		goto noqueue; -        local = mem_get0 (this->local_pool); -        if (local == NULL) { -                op_errno = ENOMEM; -                goto unwind; -        } - -        frame->local = local; -        local->wb_inode = wb_inode; - -        if (wb_inode) { -                stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync, -                                       xdata); -                if (stub == NULL) { -                        op_errno = ENOMEM; -                        goto unwind; -                } +	stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, +                                         valid, xdata); +	if (!stub) +		goto unwind; -                request = wb_enqueue (wb_inode, stub); -                if (request == NULL) { -                        op_errno = ENOMEM; -                        call_stub_destroy (stub); -                        goto unwind; -                } +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -                ret = wb_process_queue (frame, wb_inode); -                if (ret == -1) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "request queue processing failed"); -                } -        } else { -                STACK_WIND (frame, wb_fsync_cbk, FIRST_CHILD(this), -                            FIRST_CHILD(this)->fops->fsync, fd, datasync, -                            xdata); -        } +	wb_process_queue (wb_inode);          return 0; -  unwind: -        STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL); -        return 0; -} - - -int32_t -wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                 int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, -                 dict_t *xdata) -{ -        gf_dirent_t *entry      = NULL; - -        if (op_ret <= 0) { -                goto unwind; -        } +        STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); -        list_for_each_entry (entry, &entries->list, list) { -                if (!entry->inode) -			continue; -                wb_inode_create (this, entry->inode); -        } +        if (stub) +                call_stub_destroy (stub); +	return 0; -unwind: -        STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata); +noqueue: +        STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);          return 0;  } -int32_t -wb_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, -             size_t size, off_t off, dict_t *xdata) +int +wb_fsetattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, +		    struct iatt *stbuf, int32_t valid, dict_t *xdata)  { -        STACK_WIND (frame, wb_readdirp_cbk, FIRST_CHILD(this), -                    FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); +        STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);          return 0;  } -int32_t -wb_release (xlator_t *this, fd_t *fd) +int +wb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +	     struct iatt *stbuf, int32_t valid, dict_t *xdata)  { -        uint64_t    wb_file_ptr = 0; -        wb_file_t  *wb_file      = NULL; +        wb_inode_t   *wb_inode     = NULL; +        call_stub_t  *stub         = NULL; -        GF_VALIDATE_OR_GOTO ("write-behind", this, out); -        GF_VALIDATE_OR_GOTO (this->name, fd, out); +	wb_inode = wb_inode_ctx_get (this, fd->inode); +	if (!wb_inode) +		goto noqueue; -        fd_ctx_del (fd, this, &wb_file_ptr); -        wb_file = (wb_file_t *)(long) wb_file_ptr; +	stub = fop_fsetattr_stub (frame, wb_fsetattr_helper, fd, stbuf, +				  valid, xdata); +	if (!stub) +		goto unwind; -        GF_FREE (wb_file); +	if (!wb_enqueue (wb_inode, stub)) +		goto unwind; -out: +	wb_process_queue (wb_inode); + +        return 0; +unwind: +        STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); + +        if (stub) +                call_stub_destroy (stub); +	return 0; + +noqueue: +        STACK_WIND (frame, default_fsetattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);          return 0;  } -int32_t +int  wb_forget (xlator_t *this, inode_t *inode)  {          uint64_t    tmp      = 0; @@ -2917,15 +1639,16 @@ wb_forget (xlator_t *this, inode_t *inode)          wb_inode = (wb_inode_t *)(long)tmp; -        if (wb_inode != NULL) { -                LOCK (&wb_inode->lock); -                { -                        GF_ASSERT (list_empty (&wb_inode->request)); -                } -                UNLOCK (&wb_inode->lock); +	if (!wb_inode) +		return 0; -                wb_inode_destroy (wb_inode); -        } +	LOCK (&wb_inode->lock); +	{ +		GF_ASSERT (list_empty (&wb_inode->todo)); +		GF_ASSERT (list_empty (&wb_inode->liability)); +		GF_ASSERT (list_empty (&wb_inode->temptation)); +	} +	UNLOCK (&wb_inode->lock);          return 0;  } @@ -2950,10 +1673,8 @@ wb_priv_dump (xlator_t *this)          gf_proc_dump_write ("aggregate_size", "%d", conf->aggregate_size);          gf_proc_dump_write ("window_size", "%d", conf->window_size); -        gf_proc_dump_write ("enable_O_SYNC", "%d", conf->enable_O_SYNC);          gf_proc_dump_write ("flush_behind", "%d", conf->flush_behind); -        gf_proc_dump_write ("enable_trickling_writes", "%d", -                            conf->enable_trickling_writes); +        gf_proc_dump_write ("trickling_writes", "%d", conf->trickling_writes);          ret = 0;  out: @@ -2962,48 +1683,45 @@ out:  void -__wb_dump_requests (struct list_head *head, char *prefix, char passive) +__wb_dump_requests (struct list_head *head, char *prefix)  {          char          key[GF_DUMP_MAX_BUF_LEN]        = {0, };          char          key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }, flag = 0; -        wb_request_t *request                         = NULL; +        wb_request_t *req                             = NULL; -        list_for_each_entry (request, head, list) { -                gf_proc_dump_build_key (key, prefix, passive ? "passive-request" -                                        : "active-request"); +        list_for_each_entry (req, head, all) {                  gf_proc_dump_build_key (key_prefix, key, -                                        (char *)gf_fop_list[request->fop]); +                                        (char *)gf_fop_list[req->fop]);                  gf_proc_dump_add_section(key_prefix); -                gf_proc_dump_write ("request-ptr", "%p", request); +                gf_proc_dump_write ("request-ptr", "%p", req); -                gf_proc_dump_write ("refcount", "%d", request->refcount); +                gf_proc_dump_write ("refcount", "%d", req->refcount); -                if (request->fop == GF_FOP_WRITE) { -                        flag = request->flags.write_request.stack_wound; -                        gf_proc_dump_write ("stack_wound", "%d", flag); +		if (list_empty (&req->todo)) +			gf_proc_dump_write ("wound", "yes"); +		else +			gf_proc_dump_write ("wound", "no"); +                if (req->fop == GF_FOP_WRITE) {                          gf_proc_dump_write ("size", "%"GF_PRI_SIZET, -                                            request->write_size); +                                            req->write_size);                          gf_proc_dump_write ("offset", "%"PRId64, -                                            request->stub->args.writev.off); +                                            req->stub->args.writev.off); -                        flag = request->flags.write_request.write_behind; -                        gf_proc_dump_write ("write_behind", "%d", flag); +                        flag = req->ordering.lied; +                        gf_proc_dump_write ("lied", "%d", flag); -                        flag = request->flags.write_request.got_reply; -                        gf_proc_dump_write ("got_reply", "%d", flag); +                        flag = req->ordering.append; +                        gf_proc_dump_write ("append", "%d", flag); -                        flag = request->flags.write_request.virgin; -                        gf_proc_dump_write ("virgin", "%d", flag); +                        flag = req->ordering.fulfilled; +                        gf_proc_dump_write ("fulfilled", "%d", flag); -                        flag = request->flags.write_request.flush_all; -                        gf_proc_dump_write ("flush_all", "%d", flag); -                } else { -                        flag = request->flags.other_requests.marked_for_resume; -                        gf_proc_dump_write ("marked_for_resume", "%d", flag); +                        flag = req->ordering.go; +                        gf_proc_dump_write ("go", "%d", flag);                  }          }  } @@ -3047,22 +1765,14 @@ wb_inode_dump (xlator_t *this, inode_t *inode)          gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET,                              wb_inode->window_current); -        gf_proc_dump_write ("aggregate_current", "%"GF_PRI_SIZET, -                            wb_inode->aggregate_current); -          gf_proc_dump_write ("op_ret", "%d", wb_inode->op_ret);          gf_proc_dump_write ("op_errno", "%d", wb_inode->op_errno);          LOCK (&wb_inode->lock);          { -                if (!list_empty (&wb_inode->request)) { -                        __wb_dump_requests (&wb_inode->request, key_prefix, 0); -                } - -                if (!list_empty (&wb_inode->passive_requests)) { -                        __wb_dump_requests (&wb_inode->passive_requests, -                                            key_prefix, 1); +                if (!list_empty (&wb_inode->all)) { +                        __wb_dump_requests (&wb_inode->all, key_prefix);                  }          }          UNLOCK (&wb_inode->lock); @@ -3074,65 +1784,6 @@ out:  int -wb_fd_dump (xlator_t *this, fd_t *fd) -{ -        wb_file_t  *wb_file                        = NULL; -        char       *path                           = NULL; -        char       key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; -        int        ret                             = -1; -        gf_boolean_t  section_added                = _gf_false; - -        gf_proc_dump_build_key (key_prefix, "xlator.performance.write-behind", -                                "wb_file"); - -        if ((fd == NULL) || (this == NULL)) { -                goto out; -        } - -        ret = TRY_LOCK(&fd->lock); -        if (ret) -                goto out; -        { -                wb_file = __wb_fd_ctx_get (this, fd); -        } -        UNLOCK(&fd->lock); - -        if (wb_file == NULL) { -                goto out; -        } - -        gf_proc_dump_add_section (key_prefix); -        section_added = _gf_true; - -        __inode_path (fd->inode, NULL, &path); -        if (path != NULL) { -                gf_proc_dump_write ("path", "%s", path); -                GF_FREE (path); -        } - -        gf_proc_dump_write ("fd", "%p", fd); - -        gf_proc_dump_write ("flags", "%d", wb_file->flags); - -        gf_proc_dump_write ("flags", "%s", -                            (wb_file->flags & O_APPEND) ? "O_APPEND" -                            : "!O_APPEND"); - -        gf_proc_dump_write ("disabled", "%d", wb_file->disabled); - -out: -        if (ret && fd && this) { -                if (_gf_false == section_added) -                        gf_proc_dump_add_section (key_prefix); -                gf_proc_dump_write ("Unable to dump the fd", -                                    "(Lock acquisition failed) %s", -                                    uuid_utoa (fd->inode->gfid)); -        } -        return 0; -} - - -int32_t  mem_acct_init (xlator_t *this)  {          int     ret = -1; @@ -3166,6 +1817,14 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,                            out); +        GF_OPTION_RECONF ("trickling-writes", conf->trickling_writes, options, +			  bool, out); + +        GF_OPTION_RECONF ("strict-O_DIRECT", conf->strict_O_DIRECT, options, +			  bool, out); + +        GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering, +			  options, bool, out);          ret = 0;  out:          return ret; @@ -3196,8 +1855,6 @@ init (xlator_t *this)                  goto out;          } -        GF_OPTION_INIT("enable-O_SYNC", conf->enable_O_SYNC, bool, out); -          /* configure 'options aggregate-size <size>' */          conf->aggregate_size = WB_AGGREGATE_SIZE; @@ -3223,16 +1880,12 @@ init (xlator_t *this)          /* configure 'option flush-behind <on/off>' */          GF_OPTION_INIT ("flush-behind", conf->flush_behind, bool, out); -        GF_OPTION_INIT ("enable-trickling-writes", -                        conf->enable_trickling_writes, bool, out); +        GF_OPTION_INIT ("trickling-writes", conf->trickling_writes, bool, out); -        this->local_pool = mem_pool_new (wb_local_t, 64); -        if (!this->local_pool) { -                ret = -1; -                gf_log (this->name, GF_LOG_ERROR, -                        "failed to create local_t's memory pool"); -                goto out; -        } +        GF_OPTION_INIT ("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out); + +        GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering, +			bool, out);          this->private = conf;          ret = 0; @@ -3266,10 +1919,7 @@ out:  struct xlator_fops fops = { -        .lookup      = wb_lookup,          .writev      = wb_writev, -        .open        = wb_open, -        .create      = wb_create,          .readv       = wb_readv,          .flush       = wb_flush,          .fsync       = wb_fsync, @@ -3278,20 +1928,21 @@ struct xlator_fops fops = {          .truncate    = wb_truncate,          .ftruncate   = wb_ftruncate,          .setattr     = wb_setattr, -        .readdirp    = wb_readdirp, +        .fsetattr    = wb_fsetattr,  }; +  struct xlator_cbks cbks = {          .forget   = wb_forget, -        .release  = wb_release,  }; +  struct xlator_dumpops dumpops = {          .priv      =  wb_priv_dump,          .inodectx  =  wb_inode_dump, -        .fdctx     =  wb_fd_dump,  }; +  struct volume_options options[] = {          { .key  = {"flush-behind"},            .type = GF_OPTION_TYPE_BOOL, @@ -3300,7 +1951,7 @@ struct volume_options options[] = {                            "translator to perform flush in background, by "                            "returning success (or any errors, if any of "                            "previous  writes were failed) to application even " -                          "before flush is sent to backend filesystem. " +                          "before flush FOP is sent to backend filesystem. "          },          { .key  = {"cache-size", "window-size"},            .type = GF_OPTION_TYPE_SIZET, @@ -3309,21 +1960,20 @@ struct volume_options options[] = {            .default_value = "1MB",            .description = "Size of the write-behind buffer for a single file "                           "(inode)." - -        }, -        { .key = {"disable-for-first-nbytes"}, -          .type = GF_OPTION_TYPE_SIZET, -          .min = 0, -          .max = 1 * GF_UNIT_MB, -          .default_value = "0",          }, -        { .key = {"enable-O_SYNC"}, +        { .key = {"trickling-writes"},            .type = GF_OPTION_TYPE_BOOL,            .default_value = "on",          }, -        { .key = {"enable-trickling-writes"}, +        { .key = {"strict-O_DIRECT"},            .type = GF_OPTION_TYPE_BOOL, -          .default_value = "on", +          .default_value = "off", +        }, +        { .key = {"strict-write-ordering"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +	  .description = "Do not let later writes overtake earlier writes even " +	                  "if they do not overlap",          },          { .key = {NULL} },  };  | 
