summaryrefslogtreecommitdiffstats
path: root/xlators/performance/write-behind/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/performance/write-behind/src')
-rw-r--r--xlators/performance/write-behind/src/Makefile.am10
-rw-r--r--xlators/performance/write-behind/src/write-behind-mem-types.h37
-rw-r--r--xlators/performance/write-behind/src/write-behind-messages.h31
-rw-r--r--xlators/performance/write-behind/src/write-behind.c5010
4 files changed, 2806 insertions, 2282 deletions
diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am
index a5ebc90bdca..a6a16fcc080 100644
--- a/xlators/performance/write-behind/src/Makefile.am
+++ b/xlators/performance/write-behind/src/Makefile.am
@@ -1,14 +1,16 @@
xlator_LTLIBRARIES = write-behind.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-write_behind_la_LDFLAGS = -module -avoidversion
+write_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
write_behind_la_SOURCES = write-behind.c
write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = write-behind-mem-types.h
+noinst_HEADERS = write-behind-mem-types.h write-behind-messages.h
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/performance/write-behind/src/write-behind-mem-types.h b/xlators/performance/write-behind/src/write-behind-mem-types.h
index 6184615195b..a0647299150 100644
--- a/xlators/performance/write-behind/src/write-behind-mem-types.h
+++ b/xlators/performance/write-behind/src/write-behind-mem-types.h
@@ -1,35 +1,24 @@
/*
- Copyright (c) 2008-2009 Gluster, Inc. <http://www.gluster.com>
- This file is part of GlusterFS.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-
#ifndef __WB_MEM_TYPES_H__
#define __WB_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_wb_mem_types_ {
- gf_wb_mt_wb_file_t = gf_common_mt_end + 1,
- gf_wb_mt_wb_local_t,
- gf_wb_mt_wb_request_t,
- gf_wb_mt_iovec,
- gf_wb_mt_wb_conf_t,
- gf_wb_mt_end
+ gf_wb_mt_wb_file_t = gf_common_mt_end + 1,
+ gf_wb_mt_wb_request_t,
+ gf_wb_mt_iovec,
+ gf_wb_mt_wb_conf_t,
+ gf_wb_mt_wb_inode_t,
+ gf_wb_mt_end
};
#endif
-
diff --git a/xlators/performance/write-behind/src/write-behind-messages.h b/xlators/performance/write-behind/src/write-behind-messages.h
new file mode 100644
index 00000000000..e9ea474879b
--- /dev/null
+++ b/xlators/performance/write-behind/src/write-behind-messages.h
@@ -0,0 +1,31 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _WRITE_BEHIND_MESSAGES_H_
+#define _WRITE_BEHIND_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(WRITE_BEHIND, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE,
+ WRITE_BEHIND_MSG_INIT_FAILED, WRITE_BEHIND_MSG_INVALID_ARGUMENT,
+ WRITE_BEHIND_MSG_NO_MEMORY, WRITE_BEHIND_MSG_SIZE_NOT_SET,
+ WRITE_BEHIND_MSG_VOL_MISCONFIGURED,
+ WRITE_BEHIND_MSG_RES_UNAVAILABLE);
+
+#endif /* _WRITE_BEHIND_MESSAGES_H_ */
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 4437afce0cf..00cfca016e6 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -1,2776 +1,3278 @@
/*
- Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-/*TODO: check for non null wb_file_data before getting wb_file */
-
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "list.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "common-utils.h"
-#include "call-stub.h"
-#include "statedump.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/defaults.h>
#include "write-behind-mem-types.h"
+#include "write-behind-messages.h"
#define MAX_VECTOR_COUNT 8
#define WB_AGGREGATE_SIZE 131072 /* 128 KB */
-#define WB_WINDOW_SIZE 1048576 /* 1MB */
-
+#define WB_WINDOW_SIZE 1048576 /* 1MB */
+
typedef struct list_head list_head_t;
struct wb_conf;
-struct wb_page;
-struct wb_file;
-
-
-typedef struct wb_file {
- int disabled;
- uint64_t disable_till;
- size_t window_conf;
- size_t window_current;
- int32_t flags;
- size_t aggregate_current;
- int32_t refcount;
- int32_t op_ret;
- int32_t op_errno;
- list_head_t request;
- list_head_t passive_requests;
- fd_t *fd;
- gf_lock_t lock;
- xlator_t *this;
-}wb_file_t;
-
+struct wb_inode;
+
+typedef struct wb_inode {
+ ssize_t window_conf;
+ ssize_t window_current;
+ ssize_t transit; /* size of data stack_wound, and yet
+ to be fulfilled (wb_fulfill_cbk).
+ used for trickling_writes
+ */
+
+ list_head_t all; /* All requests, from enqueue() till destroy().
+ Used only for resetting generation
+ number when empty.
+ */
+ list_head_t todo; /* Work to do (i.e, STACK_WIND to server).
+ Once we STACK_WIND, the entry is taken
+ off the list. If it is non-sync write,
+ then we continue to track it via @liability
+ or @temptation depending on the status
+ of its writeback.
+ */
+ list_head_t liability; /* Non-sync writes which are lied
+ (STACK_UNWIND'ed to caller) but ack
+ from server not yet complete. This
+ is the "liability" which we hold, and
+ must guarantee that dependent operations
+ which arrive later (which overlap, etc.)
+ are issued only after their dependencies
+ in this list are "fulfilled".
+
+ Server acks for entries in this list
+ shrinks the window.
+
+ The sum total of all req->write_size
+ of entries in this list must be kept less
+ than the permitted window size.
+ */
+ list_head_t temptation; /* Operations for which we are tempted
+ to 'lie' (write-behind), but temporarily
+ holding off (because of insufficient
+ window capacity, etc.)
+
+ This is the list to look at to grow
+ the window (in __wb_pick_unwinds()).
+
+ Entries typically get chosen from
+ write-behind from this list, and therefore
+ get "upgraded" to the "liability" list.
+ */
+ list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC
+ which are currently STACK_WIND'ed towards the server.
+ This is for guaranteeing that no two overlapping
+ writes are in progress at the same time. Modules
+ like eager-lock in AFR depend on this behavior.
+ */
+ list_head_t invalidate_list; /* list of wb_inodes that were marked for
+ * iatt invalidation due to requests in
+ * liability queue fulfilled while there
+ * was a readdirp session on parent
+ * directory. For a directory inode, this
+ * list points to list of children.
+ */
+ uint64_t gen; /* Liability generation number. Represents
+ the current 'state' of liability. Every
+ new addition to the liability list bumps
+ the generation number.
+
+ a newly arrived request is only required
+ to perform causal checks against the entries
+ in the liability list which were present
+ at the time of its addition. the generation
+ number at the time of its addition is stored
+ in the request and used during checks.
+
+ the liability list can grow while the request
+ waits in the todo list waiting for its
+ dependent operations to complete. however
+ it is not of the request's concern to depend
+ itself on those new entries which arrived
+ after it arrived (i.e, those that have a
+ liability generation higher than itself)
+ */
+ size_t size; /* Size of the file to catch write after EOF. */
+ gf_lock_t lock;
+ xlator_t *this;
+ inode_t *inode;
+ int dontsync; /* If positive, don't pick lies for
+ * winding. This is needed to break infinite
+ * recursion during invocation of
+ * wb_process_queue from
+ * wb_fulfill_cbk in case of an
+ * error during fulfill.
+ */
+ gf_atomic_int32_t readdirps;
+ gf_atomic_int8_t invalidate;
+
+} wb_inode_t;
typedef struct wb_request {
- list_head_t list;
- list_head_t winds;
- list_head_t unwinds;
- list_head_t other_requests;
- call_stub_t *stub;
- size_t write_size;
- int32_t refcount;
- wb_file_t *file;
- union {
- struct {
- char write_behind;
- char stack_wound;
- char got_reply;
- char virgin;
- char flush_all; /* while trying to sync to back-end,
- * don't wait till a data of size
- * equal to configured aggregate-size
- * is accumulated, instead sync
- * whatever data currently present in
- * request queue.
- */
-
- }write_request;
-
- struct {
- char marked_for_resume;
- }other_requests;
- }flags;
+ list_head_t all;
+ list_head_t todo;
+ list_head_t lie; /* either in @liability or @temptation */
+ list_head_t winds;
+ list_head_t unwinds;
+ list_head_t wip;
+
+ call_stub_t *stub;
+
+ ssize_t write_size; /* currently held size
+ (after collapsing) */
+ size_t orig_size; /* size which arrived with the request.
+ This is the size by which we grow
+ the window when unwinding the frame.
+ */
+ size_t total_size; /* valid only in @head in wb_fulfill().
+ This is the size with which we perform
+ STACK_WIND to server and therefore the
+ amount by which we shrink the window.
+ */
+
+ int op_ret;
+ int op_errno;
+
+ int32_t refcount;
+ wb_inode_t *wb_inode;
+ glusterfs_fop_t fop;
+ gf_lkowner_t lk_owner;
+ pid_t client_pid;
+ struct iobref *iobref;
+ uint64_t gen; /* inode liability state at the time of
+ request arrival */
+
+ fd_t *fd;
+ int wind_count; /* number of sync-attempts. Only
+ for debug purposes */
+ struct {
+ size_t size; /* 0 size == till infinity */
+ off_t off;
+ int append : 1; /* offset is invalid. only one
+ outstanding append at a time */
+ int tempted : 1; /* true only for non-sync writes */
+ int lied : 1; /* sin committed */
+ int fulfilled : 1; /* got server acknowledgement */
+ int go : 1; /* enough aggregating, good to go */
+ } ordering;
+
+ /* for debug purposes. A request might outlive the fop it is
+ * representing. So, preserve essential info for logging.
+ */
+ uint64_t unique;
+ uuid_t gfid;
} wb_request_t;
+typedef struct wb_conf {
+ uint64_t aggregate_size;
+ uint64_t page_size;
+ uint64_t window_size;
+ gf_boolean_t flush_behind;
+ gf_boolean_t trickling_writes;
+ gf_boolean_t strict_write_ordering;
+ gf_boolean_t strict_O_DIRECT;
+ gf_boolean_t resync_after_fsync;
+} wb_conf_t;
+
+wb_inode_t *
+__wb_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+ uint64_t value = 0;
+ wb_inode_t *wb_inode = NULL;
+ int ret = 0;
-struct wb_conf {
- uint64_t aggregate_size;
- uint64_t window_size;
- uint64_t disable_till;
- gf_boolean_t enable_O_SYNC;
- gf_boolean_t flush_behind;
- gf_boolean_t enable_trickling_writes;
-};
+ ret = __inode_ctx_get(inode, this, &value);
+ if (ret)
+ return NULL;
+ wb_inode = (wb_inode_t *)(unsigned long)value;
-typedef struct wb_local {
- list_head_t winds;
- int32_t flags;
- int32_t wbflags;
- struct wb_file *file;
- wb_request_t *request;
- int op_ret;
- int op_errno;
- call_frame_t *frame;
- int32_t reply_count;
-} wb_local_t;
+ return wb_inode;
+}
+wb_inode_t *
+wb_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+ wb_inode_t *wb_inode = NULL;
-typedef struct wb_conf wb_conf_t;
-typedef struct wb_page wb_page_t;
+ GF_VALIDATE_OR_GOTO("write-behind", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ wb_inode = __wb_inode_ctx_get(this, inode);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return wb_inode;
+}
-int32_t
-wb_process_queue (call_frame_t *frame, wb_file_t *file);
+static void
+wb_set_invalidate(wb_inode_t *wb_inode)
+{
+ int readdirps = 0;
+ inode_t *parent_inode = NULL;
+ wb_inode_t *wb_parent_inode = NULL;
-ssize_t
-wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds);
+ parent_inode = inode_parent(wb_inode->inode, NULL, NULL);
+ if (parent_inode)
+ wb_parent_inode = wb_inode_ctx_get(wb_inode->this, parent_inode);
-ssize_t
-__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size,
- char enable_trickling_writes);
+ if (wb_parent_inode) {
+ LOCK(&wb_parent_inode->lock);
+ {
+ readdirps = GF_ATOMIC_GET(wb_parent_inode->readdirps);
+ if (readdirps && list_empty(&wb_inode->invalidate_list)) {
+ inode_ref(wb_inode->inode);
+ GF_ATOMIC_INIT(wb_inode->invalidate, 1);
+ list_add(&wb_inode->invalidate_list,
+ &wb_parent_inode->invalidate_list);
+ }
+ }
+ UNLOCK(&wb_parent_inode->lock);
+ } else {
+ GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+ }
+
+ if (parent_inode)
+ inode_unref(parent_inode);
+
+ return;
+}
+void
+wb_process_queue(wb_inode_t *wb_inode);
-static int
-__wb_request_unref (wb_request_t *this)
-{
- int ret = -1;
+/*
+ Below is a succinct explanation of the code deciding whether two regions
+ overlap, from Pavan <tcp@gluster.com>.
- if (this->refcount <= 0) {
- gf_log ("wb-request", GF_LOG_DEBUG,
- "refcount(%d) is <= 0", this->refcount);
- goto out;
- }
+ For any two ranges to be non-overlapping, either the end of the first
+ range is lesser than the start of the second, or vice versa. Example -
- ret = --this->refcount;
- if (this->refcount == 0) {
- list_del_init (&this->list);
- if (this->stub && this->stub->fop == GF_FOP_WRITE) {
- call_stub_destroy (this->stub);
- }
+ <---------> <-------------->
+ p q x y
- GF_FREE (this);
- }
+ ( q < x ) or (y < p) = > No overlap.
-out:
- return ret;
-}
+ To check for *overlap*, we can negate this (using de morgan's laws), and
+ it becomes -
+ (q >= x ) and (y >= p)
-static int
-wb_request_unref (wb_request_t *this)
-{
- wb_file_t *file = NULL;
- int ret = 0;
+ Either that, or you write the negation using -
- if (this == NULL) {
- gf_log ("wb-request", GF_LOG_DEBUG,
- "request is NULL");
- goto out;
- }
-
- file = this->file;
- LOCK (&file->lock);
- {
- ret = __wb_request_unref (this);
- }
- UNLOCK (&file->lock);
+ if (! ((q < x) or (y < p)) ) {
+ "Overlap"
+ }
+*/
-out:
- return ret;
+gf_boolean_t
+wb_requests_overlap(wb_request_t *req1, wb_request_t *req2)
+{
+ uint64_t r1_start = 0;
+ uint64_t r1_end = 0;
+ uint64_t r2_start = 0;
+ uint64_t r2_end = 0;
+ gf_boolean_t do_overlap = _gf_false;
+
+ r1_start = req1->ordering.off;
+ if (req1->ordering.size)
+ r1_end = r1_start + req1->ordering.size - 1;
+ else
+ r1_end = ULLONG_MAX;
+
+ r2_start = req2->ordering.off;
+ if (req2->ordering.size)
+ r2_end = r2_start + req2->ordering.size - 1;
+ else
+ r2_end = ULLONG_MAX;
+
+ do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start));
+
+ return do_overlap;
}
-
-static wb_request_t *
-__wb_request_ref (wb_request_t *this)
+gf_boolean_t
+wb_requests_conflict(wb_request_t *lie, wb_request_t *req)
{
- if (this->refcount < 0) {
- gf_log ("wb-request", GF_LOG_DEBUG,
- "refcount(%d) is < 0", this->refcount);
- return NULL;
- }
+ wb_conf_t *conf = NULL;
+
+ conf = req->wb_inode->this->private;
+
+ if (lie == req)
+ /* request cannot conflict with itself */
+ return _gf_false;
+
+ if (lie->gen >= req->gen)
+ /* this liability entry was behind
+ us in the todo list */
+ return _gf_false;
- this->refcount++;
- return this;
+ if (lie->ordering.append)
+ /* all modifications wait for the completion
+ of outstanding append */
+ return _gf_true;
+
+ if (conf->strict_write_ordering)
+ /* We are sure (lie->gen < req->gen) by now. So
+ skip overlap check if strict write ordering is
+ requested and always return "conflict" against a
+ lower generation lie. */
+ return _gf_true;
+
+ return wb_requests_overlap(lie, req);
}
+wb_request_t *
+wb_liability_has_conflict(wb_inode_t *wb_inode, wb_request_t *req)
+{
+ wb_request_t *each = NULL;
+
+ list_for_each_entry(each, &wb_inode->liability, lie)
+ {
+ if (wb_requests_conflict(each, req) && (!each->ordering.fulfilled))
+ /* A fulfilled request shouldn't block another
+ * request (even a dependent one) from winding.
+ */
+ return each;
+ }
+
+ return NULL;
+}
wb_request_t *
-wb_request_ref (wb_request_t *this)
+wb_wip_has_conflict(wb_inode_t *wb_inode, wb_request_t *req)
{
- wb_file_t *file = NULL;
- if (this == NULL) {
- gf_log ("wb-request", GF_LOG_DEBUG,
- "request is NULL");
- return NULL;
- }
+ wb_request_t *each = NULL;
- file = this->file;
- LOCK (&file->lock);
- {
- this = __wb_request_ref (this);
- }
- UNLOCK (&file->lock);
+ if (req->stub->fop != GF_FOP_WRITE)
+ /* non-writes fundamentally never conflict with WIP requests */
+ return NULL;
- return this;
+ list_for_each_entry(each, &wb_inode->wip, wip)
+ {
+ if (each == req)
+ /* request never conflicts with itself,
+ though this condition should never occur.
+ */
+ continue;
+
+ if (wb_requests_overlap(each, req))
+ return each;
+ }
+
+ return NULL;
}
+static int
+__wb_request_unref(wb_request_t *req)
+{
+ int ret = -1;
+ wb_inode_t *wb_inode = NULL;
+ char gfid[64] = {
+ 0,
+ };
-wb_request_t *
-wb_enqueue (wb_file_t *file, call_stub_t *stub)
-{
- wb_request_t *request = NULL, *tmp = NULL;
- call_frame_t *frame = NULL;
- wb_local_t *local = NULL;
- struct iovec *vector = NULL;
- int32_t count = 0;
-
- request = GF_CALLOC (1, sizeof (*request), gf_wb_mt_wb_request_t);
- if (request == NULL) {
- goto out;
- }
+ wb_inode = req->wb_inode;
- INIT_LIST_HEAD (&request->list);
- INIT_LIST_HEAD (&request->winds);
- INIT_LIST_HEAD (&request->unwinds);
- INIT_LIST_HEAD (&request->other_requests);
+ if (req->refcount <= 0) {
+ uuid_utoa_r(req->gfid, gfid);
- request->stub = stub;
- request->file = file;
+ gf_msg(
+ "wb-request", GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_RES_UNAVAILABLE,
+ "(unique=%" PRIu64 ", fop=%s, gfid=%s, gen=%" PRIu64
+ "): "
+ "refcount(%d) is <= 0 ",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen, req->refcount);
+ goto out;
+ }
- frame = stub->frame;
- local = frame->local;
- if (local) {
- local->request = request;
- }
+ ret = --req->refcount;
+ if (req->refcount == 0) {
+ uuid_utoa_r(req->gfid, gfid);
- if (stub->fop == GF_FOP_WRITE) {
- vector = stub->args.writev.vector;
- count = stub->args.writev.count;
+ gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG,
+ "(unique = %" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): destroying request, "
+ "removing from all queues",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen);
- request->write_size = iov_length (vector, count);
- if (local) {
- local->op_ret = request->write_size;
- local->op_errno = 0;
- }
+ list_del_init(&req->todo);
+ list_del_init(&req->lie);
+ list_del_init(&req->wip);
- request->flags.write_request.virgin = 1;
+ list_del_init(&req->all);
+ if (list_empty(&wb_inode->all)) {
+ wb_inode->gen = 0;
+ /* in case of accounting errors? */
+ wb_inode->window_current = 0;
}
- LOCK (&file->lock);
- {
- list_add_tail (&request->list, &file->request);
- if (stub->fop == GF_FOP_WRITE) {
- /* reference for stack winding */
- __wb_request_ref (request);
-
- /* reference for stack unwinding */
- __wb_request_ref (request);
+ list_del_init(&req->winds);
+ list_del_init(&req->unwinds);
- file->aggregate_current += request->write_size;
- } else {
- list_for_each_entry (tmp, &file->request, list) {
- if (tmp->stub && tmp->stub->fop
- == GF_FOP_WRITE) {
- tmp->flags.write_request.flush_all = 1;
- }
- }
-
- /*reference for resuming */
- __wb_request_ref (request);
- }
+ if (req->stub) {
+ call_stub_destroy(req->stub);
+ req->stub = NULL;
}
- UNLOCK (&file->lock);
+ if (req->iobref)
+ iobref_unref(req->iobref);
+
+ if (req->fd)
+ fd_unref(req->fd);
+
+ GF_FREE(req);
+ }
out:
- return request;
+ return ret;
}
-
-wb_file_t *
-wb_file_create (xlator_t *this, fd_t *fd, int32_t flags)
+static int
+wb_request_unref(wb_request_t *req)
{
- wb_file_t *file = NULL;
- wb_conf_t *conf = this->private;
-
- file = GF_CALLOC (1, sizeof (*file), gf_wb_mt_wb_file_t);
- if (file == NULL) {
- goto out;
- }
+ wb_inode_t *wb_inode = NULL;
+ int ret = -1;
- INIT_LIST_HEAD (&file->request);
- INIT_LIST_HEAD (&file->passive_requests);
+ GF_VALIDATE_OR_GOTO("write-behind", req, out);
- /*
- fd_ref() not required, file should never decide the existance of
- an fd
- */
- file->fd= fd;
- file->disable_till = conf->disable_till;
- file->this = this;
- file->refcount = 1;
- file->window_conf = conf->window_size;
- file->flags = flags;
+ wb_inode = req->wb_inode;
- fd_ctx_set (fd, this, (uint64_t)(long)file);
+ LOCK(&wb_inode->lock);
+ {
+ ret = __wb_request_unref(req);
+ }
+ UNLOCK(&wb_inode->lock);
out:
- return file;
+ return ret;
}
-void
-wb_file_destroy (wb_file_t *file)
+static wb_request_t *
+__wb_request_ref(wb_request_t *req)
{
- int32_t refcount = 0;
+ GF_VALIDATE_OR_GOTO("write-behind", req, out);
- LOCK (&file->lock);
- {
- refcount = --file->refcount;
- }
- UNLOCK (&file->lock);
+ if (req->refcount < 0) {
+ gf_msg("wb-request", GF_LOG_WARNING, 0,
+ WRITE_BEHIND_MSG_RES_UNAVAILABLE, "refcount(%d) is < 0",
+ req->refcount);
+ req = NULL;
+ goto out;
+ }
- if (!refcount){
- LOCK_DESTROY (&file->lock);
- GF_FREE (file);
- }
+ req->refcount++;
- return;
+out:
+ return req;
}
+wb_request_t *
+wb_request_ref(wb_request_t *req)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("write-behind", req, out);
-int32_t
-wb_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf)
+ wb_inode = req->wb_inode;
+ LOCK(&wb_inode->lock);
+ {
+ req = __wb_request_ref(req);
+ }
+ UNLOCK(&wb_inode->lock);
+
+out:
+ return req;
+}
+
+gf_boolean_t
+wb_enqueue_common(wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
{
- wb_local_t *local = NULL;
- list_head_t *winds = NULL;
- wb_file_t *file = NULL;
- wb_request_t *request = NULL, *dummy = NULL;
- wb_local_t *per_request_local = NULL;
- int32_t ret = -1;
- fd_t *fd = NULL;
+ wb_request_t *req = NULL;
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out);
+ GF_VALIDATE_OR_GOTO(wb_inode->this->name, stub, out);
+
+ req = GF_CALLOC(1, sizeof(*req), gf_wb_mt_wb_request_t);
+ if (!req)
+ goto out;
+
+ INIT_LIST_HEAD(&req->all);
+ INIT_LIST_HEAD(&req->todo);
+ INIT_LIST_HEAD(&req->lie);
+ INIT_LIST_HEAD(&req->winds);
+ INIT_LIST_HEAD(&req->unwinds);
+ INIT_LIST_HEAD(&req->wip);
+
+ req->stub = stub;
+ req->wb_inode = wb_inode;
+ req->fop = stub->fop;
+ req->ordering.tempted = tempted;
+ req->unique = stub->frame->root->unique;
+
+ inode = ((stub->args.fd != NULL) ? stub->args.fd->inode
+ : stub->args.loc.inode);
+
+ if (inode)
+ gf_uuid_copy(req->gfid, inode->gfid);
+
+ if (stub->fop == GF_FOP_WRITE) {
+ req->write_size = iov_length(stub->args.vector, stub->args.count);
+
+ /* req->write_size can change as we collapse
+ small writes. But the window needs to grow
+ only by how much we acknowledge the app. so
+ copy the original size in orig_size for the
+ purpose of accounting.
+ */
+ req->orig_size = req->write_size;
+ /* Let's be optimistic that we can
+ lie about it
+ */
+ req->op_ret = req->write_size;
+ req->op_errno = 0;
+
+ if (stub->args.fd && (stub->args.fd->flags & O_APPEND))
+ req->ordering.append = 1;
+ }
+
+ req->lk_owner = stub->frame->root->lk_owner;
+ req->client_pid = stub->frame->root->pid;
+
+ switch (stub->fop) {
+ case GF_FOP_WRITE:
+ LOCK(&wb_inode->lock);
+ {
+ if (wb_inode->size < stub->args.offset) {
+ req->ordering.off = wb_inode->size;
+ req->ordering.size = stub->args.offset + req->write_size -
+ wb_inode->size;
+ } else {
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = req->write_size;
+ }
- local = frame->local;
- winds = &local->winds;
- file = local->file;
+ if (wb_inode->size < stub->args.offset + req->write_size)
+ wb_inode->size = stub->args.offset + req->write_size;
+ }
+ UNLOCK(&wb_inode->lock);
- LOCK (&file->lock);
- {
- list_for_each_entry_safe (request, dummy, winds, winds) {
- request->flags.write_request.got_reply = 1;
+ req->fd = fd_ref(stub->args.fd);
- if (!request->flags.write_request.write_behind
- && (op_ret == -1)) {
- per_request_local = request->stub->frame->local;
- per_request_local->op_ret = op_ret;
- per_request_local->op_errno = op_errno;
- }
+ break;
+ case GF_FOP_READ:
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = stub->args.size;
- if (request->flags.write_request.write_behind) {
- file->window_current -= request->write_size;
- }
+ req->fd = fd_ref(stub->args.fd);
- __wb_request_unref (request);
- }
-
- if (op_ret == -1) {
- file->op_ret = op_ret;
- file->op_errno = op_errno;
- }
- fd = file->fd;
- }
- UNLOCK (&file->lock);
-
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- LOCK (&file->lock);
- {
- file->op_ret = -1;
- file->op_errno = ENOMEM;
- }
- UNLOCK (&file->lock);
+ break;
+ case GF_FOP_TRUNCATE:
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = 0; /* till infinity */
+ LOCK(&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK(&wb_inode->lock);
+ break;
+ case GF_FOP_FTRUNCATE:
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = 0; /* till infinity */
+ LOCK(&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK(&wb_inode->lock);
+
+ req->fd = fd_ref(stub->args.fd);
+
+ break;
+ default:
+ if (stub && stub->args.fd)
+ req->fd = fd_ref(stub->args.fd);
+
+ break;
+ }
+
+ LOCK(&wb_inode->lock);
+ {
+ list_add_tail(&req->all, &wb_inode->all);
+
+ req->gen = wb_inode->gen;
+
+ list_add_tail(&req->todo, &wb_inode->todo);
+ __wb_request_ref(req); /* for wind */
+
+ if (req->ordering.tempted) {
+ list_add_tail(&req->lie, &wb_inode->temptation);
+ __wb_request_ref(req); /* for unwind */
}
+ }
+ UNLOCK(&wb_inode->lock);
- /* safe place to do fd_unref */
- fd_unref (fd);
+out:
+ if (!req)
+ return _gf_false;
- STACK_DESTROY (frame->root);
+ return _gf_true;
+}
- return 0;
+gf_boolean_t
+wb_enqueue(wb_inode_t *wb_inode, call_stub_t *stub)
+{
+ return wb_enqueue_common(wb_inode, stub, 0);
}
+gf_boolean_t
+wb_enqueue_tempted(wb_inode_t *wb_inode, call_stub_t *stub)
+{
+ return wb_enqueue_common(wb_inode, stub, 1);
+}
-ssize_t
-wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
+wb_inode_t *
+__wb_inode_create(xlator_t *this, inode_t *inode)
{
- wb_request_t *dummy = NULL, *request = NULL;
- wb_request_t *first_request = NULL, *next = NULL;
- size_t total_count = 0, count = 0;
- size_t copied = 0;
- call_frame_t *sync_frame = NULL;
- struct iobref *iobref = NULL;
- wb_local_t *local = NULL;
- struct iovec *vector = NULL;
- ssize_t current_size = 0, bytes = 0;
- size_t bytecount = 0;
- wb_conf_t *conf = NULL;
- fd_t *fd = NULL;
- int32_t op_errno = -1;
+ wb_inode_t *wb_inode = NULL;
+ wb_conf_t *conf = NULL;
+ int ret = 0;
- if (frame == NULL) {
- op_errno = EINVAL;
- goto out;
- }
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- conf = file->this->private;
- list_for_each_entry (request, winds, winds) {
- total_count += request->stub->args.writev.count;
- if (total_count > 0) {
- break;
- }
- }
+ conf = this->private;
- if (total_count == 0) {
- gf_log (file->this->name, GF_LOG_TRACE, "no vectors are to be"
- "synced");
- goto out;
- }
-
- list_for_each_entry_safe (request, dummy, winds, winds) {
- if (!vector) {
- vector = GF_MALLOC (VECTORSIZE (MAX_VECTOR_COUNT),
- gf_wb_mt_iovec);
- if (vector == NULL) {
- bytes = -1;
- op_errno = ENOMEM;
- gf_log (file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
-
- iobref = iobref_new ();
- if (iobref == NULL) {
- bytes = -1;
- op_errno = ENOMEM;
- gf_log (file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
-
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- bytes = -1;
- op_errno = ENOMEM;
- gf_log (file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
-
- INIT_LIST_HEAD (&local->winds);
-
- first_request = request;
- current_size = 0;
- }
+ wb_inode = GF_CALLOC(1, sizeof(*wb_inode), gf_wb_mt_wb_inode_t);
+ if (!wb_inode)
+ goto out;
- count += request->stub->args.writev.count;
- bytecount = VECTORSIZE (request->stub->args.writev.count);
- memcpy (((char *)vector)+copied,
- request->stub->args.writev.vector,
- bytecount);
- copied += bytecount;
-
- current_size += request->write_size;
-
- if (request->stub->args.writev.iobref) {
- iobref_merge (iobref,
- request->stub->args.writev.iobref);
- }
+ INIT_LIST_HEAD(&wb_inode->all);
+ INIT_LIST_HEAD(&wb_inode->todo);
+ INIT_LIST_HEAD(&wb_inode->liability);
+ INIT_LIST_HEAD(&wb_inode->temptation);
+ INIT_LIST_HEAD(&wb_inode->wip);
+ INIT_LIST_HEAD(&wb_inode->invalidate_list);
- next = NULL;
- if (request->winds.next != winds) {
- next = list_entry (request->winds.next,
- wb_request_t, winds);
- }
+ wb_inode->this = this;
- list_del_init (&request->winds);
- list_add_tail (&request->winds, &local->winds);
-
- if ((!next)
- || ((count + next->stub->args.writev.count)
- > MAX_VECTOR_COUNT)
- || ((current_size + next->write_size)
- > conf->aggregate_size))
- {
- sync_frame = copy_frame (frame);
- if (sync_frame == NULL) {
- bytes = -1;
- op_errno = ENOMEM;
- gf_log (file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
-
- sync_frame->local = local;
- local->file = file;
-
- LOCK (&file->lock);
- {
- fd = file->fd;
- }
- UNLOCK (&file->lock);
-
- fd_ref (fd);
-
- bytes += current_size;
- STACK_WIND (sync_frame,
- wb_sync_cbk,
- FIRST_CHILD(sync_frame->this),
- FIRST_CHILD(sync_frame->this)->fops->writev,
- fd, vector,
- count,
- first_request->stub->args.writev.off,
- iobref);
-
- iobref_unref (iobref);
- GF_FREE (vector);
- first_request = NULL;
- iobref = NULL;
- vector = NULL;
- sync_frame = NULL;
- local = NULL;
- copied = count = 0;
- }
- }
+ wb_inode->window_conf = conf->window_size;
+ wb_inode->inode = inode;
+
+ LOCK_INIT(&wb_inode->lock);
+ GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+ GF_ATOMIC_INIT(wb_inode->readdirps, 0);
+
+ ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)wb_inode);
+ if (ret) {
+ GF_FREE(wb_inode);
+ wb_inode = NULL;
+ }
out:
- if (sync_frame != NULL) {
- sync_frame->local = NULL;
- STACK_DESTROY (sync_frame->root);
- }
+ return wb_inode;
+}
- if (local != NULL) {
- /* had we winded these requests, we would have unrefed
- * in wb_sync_cbk.
- */
- list_for_each_entry_safe (request, dummy, &local->winds,
- winds) {
- wb_request_unref (request);
- }
+wb_inode_t *
+wb_inode_create(xlator_t *this, inode_t *inode)
+{
+ wb_inode_t *wb_inode = NULL;
- GF_FREE (local);
- local = NULL;
- }
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- if (iobref != NULL) {
- iobref_unref (iobref);
- }
+ LOCK(&inode->lock);
+ {
+ wb_inode = __wb_inode_ctx_get(this, inode);
+ if (!wb_inode)
+ wb_inode = __wb_inode_create(this, inode);
+ }
+ UNLOCK(&inode->lock);
- if (vector != NULL) {
- GF_FREE (vector);
- }
+out:
+ return wb_inode;
+}
- if (bytes == -1) {
- /*
- * had we winded these requests, we would have unrefed
- * in wb_sync_cbk.
- */
- if (local) {
- list_for_each_entry_safe (request, dummy, &local->winds,
- winds) {
- wb_request_unref (request);
- }
- }
+void
+wb_inode_destroy(wb_inode_t *wb_inode)
+{
+ GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out);
- if (file != NULL) {
- LOCK (&file->lock);
- {
- file->op_ret = -1;
- file->op_errno = op_errno;
- }
- UNLOCK (&file->lock);
- }
- }
+ GF_ASSERT(list_empty(&wb_inode->todo));
+ GF_ASSERT(list_empty(&wb_inode->liability));
+ GF_ASSERT(list_empty(&wb_inode->temptation));
+
+ LOCK_DESTROY(&wb_inode->lock);
+ GF_FREE(wb_inode);
+out:
+ return;
+}
+
+void
+__wb_fulfill_request(wb_request_t *req)
+{
+ wb_inode_t *wb_inode = NULL;
+ char gfid[64] = {
+ 0,
+ };
+
+ wb_inode = req->wb_inode;
+
+ req->ordering.fulfilled = 1;
+ wb_inode->window_current -= req->total_size;
+ wb_inode->transit -= req->total_size;
+
+ uuid_utoa_r(req->gfid, gfid);
+
+ gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): request fulfilled. "
+ "removing the request from liability queue? = %s",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen,
+ req->ordering.lied ? "yes" : "no");
+
+ if (req->ordering.lied) {
+ /* 1. If yes, request is in liability queue and hence can be
+ safely removed from list.
+ 2. If no, request is in temptation queue and hence should be
+ left in the queue so that wb_pick_unwinds picks it up
+ */
+ list_del_init(&req->lie);
+ } else {
+ /* TODO: fail the req->frame with error if
+ necessary
+ */
+ }
- return bytes;
+ list_del_init(&req->wip);
+ __wb_request_unref(req);
}
+/* get a flush/fsync waiting on req */
+wb_request_t *
+__wb_request_waiting_on(wb_request_t *req)
+{
+ wb_inode_t *wb_inode = NULL;
+ wb_request_t *trav = NULL;
+
+ wb_inode = req->wb_inode;
+
+ list_for_each_entry(trav, &wb_inode->todo, todo)
+ {
+ if (((trav->stub->fop == GF_FOP_FLUSH) ||
+ (trav->stub->fop == GF_FOP_FSYNC)) &&
+ (trav->gen >= req->gen))
+ return trav;
+ }
+
+ return NULL;
+}
-int32_t
-wb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *buf)
+void
+__wb_add_request_for_retry(wb_request_t *req)
{
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- call_frame_t *process_frame = NULL;
- wb_file_t *file = NULL;
- int32_t ret = -1;
- fd_t *fd = NULL;
-
- local = frame->local;
- file = local->file;
+ wb_inode_t *wb_inode = NULL;
- request = local->request;
- if (request) {
- process_frame = copy_frame (frame);
- if (process_frame == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
- }
+ if (!req)
+ goto out;
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf);
+ wb_inode = req->wb_inode;
- if (request != NULL) {
- wb_request_unref (request);
- }
+ /* response was unwound and no waiter waiting on this request, retry
+ till a flush or fsync (subject to conf->resync_after_fsync).
+ */
+ wb_inode->transit -= req->total_size;
- if (process_frame != NULL) {
- ret = wb_process_queue (process_frame, file);
- if ((ret == -1) && (errno == ENOMEM) && (file != NULL)) {
- LOCK (&file->lock);
- {
- file->op_ret = -1;
- file->op_errno = ENOMEM;
- }
- UNLOCK (&file->lock);
- }
+ req->total_size = 0;
- STACK_DESTROY (process_frame->root);
- }
+ list_del_init(&req->winds);
+ list_del_init(&req->todo);
+ list_del_init(&req->wip);
- if (file) {
- LOCK (&file->lock);
- {
- fd = file->fd;
- }
- UNLOCK (&file->lock);
+ /* sanitize ordering flags to retry */
+ req->ordering.go = 0;
- fd_unref (fd);
- }
+ /* Add back to todo list to retry */
+ list_add(&req->todo, &wb_inode->todo);
- return 0;
+out:
+ return;
}
-
-static int32_t
-wb_stat_helper (call_frame_t *frame, xlator_t *this, loc_t *loc)
+void
+__wb_add_head_for_retry(wb_request_t *head)
{
- STACK_WIND (frame, wb_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc);
- return 0;
+ wb_request_t *req = NULL, *tmp = NULL;
+
+ if (!head)
+ goto out;
+
+ list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds)
+ {
+ __wb_add_request_for_retry(req);
+ }
+
+ __wb_add_request_for_retry(head);
+
+out:
+ return;
}
+void
+wb_add_head_for_retry(wb_request_t *head)
+{
+ if (!head)
+ goto out;
-int32_t
-wb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- wb_file_t *file = NULL;
- fd_t *iter_fd = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1, op_errno = EINVAL;
-
- if (loc->inode) {
- /* FIXME: fd_lookup extends life of fd till stat returns */
- iter_fd = fd_lookup (loc->inode, frame->root->pid);
- if (iter_fd) {
- if (!fd_ctx_get (iter_fd, this, &tmp_file)) {
- file = (wb_file_t *)(long)tmp_file;
- } else {
- fd_unref (iter_fd);
- iter_fd = NULL;
- }
- }
- }
+ LOCK(&head->wb_inode->lock);
+ {
+ __wb_add_head_for_retry(head);
+ }
+ UNLOCK(&head->wb_inode->lock);
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+out:
+ return;
+}
- local->file = file;
+void
+__wb_fulfill_request_err(wb_request_t *req, int32_t op_errno)
+{
+ wb_inode_t *wb_inode = NULL;
+ wb_request_t *waiter = NULL;
+ wb_conf_t *conf = NULL;
- frame->local = local;
+ wb_inode = req->wb_inode;
- if (file) {
- stub = fop_stat_stub (frame, wb_stat_helper, loc);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ conf = wb_inode->this->private;
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ req->op_ret = -1;
+ req->op_errno = op_errno;
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ if (req->ordering.lied)
+ waiter = __wb_request_waiting_on(req);
+ if (!req->ordering.lied || waiter) {
+ if (!req->ordering.lied) {
+ /* response to app is still pending, send failure in
+ * response.
+ */
} else {
- STACK_WIND (frame, wb_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc);
+ /* response was sent, store the error in a
+ * waiter (either an fsync or flush).
+ */
+ waiter->op_ret = -1;
+ waiter->op_errno = op_errno;
}
- return 0;
-
-unwind:
- STACK_UNWIND_STRICT (stat, frame, -1, op_errno, NULL);
- if (stub) {
- call_stub_destroy (stub);
- }
-
- if (iter_fd != NULL) {
- fd_unref (iter_fd);
+ if (!req->ordering.lied || (waiter->stub->fop == GF_FOP_FLUSH) ||
+ ((waiter->stub->fop == GF_FOP_FSYNC) &&
+ !conf->resync_after_fsync)) {
+ /* No retry needed, forget the request */
+ __wb_fulfill_request(req);
+ return;
}
+ }
- return 0;
-}
+ __wb_add_request_for_retry(req);
+ return;
+}
-int32_t
-wb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *buf)
+void
+wb_head_done(wb_request_t *head)
{
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
- int32_t ret = -1;
-
- local = frame->local;
- file = local->file;
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
+ wb_inode_t *wb_inode = NULL;
- request = local->request;
- if ((file != NULL) && (request != NULL)) {
- wb_request_unref (request);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
- }
+ wb_inode = head->wb_inode;
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf);
+ LOCK(&wb_inode->lock);
+ {
+ list_for_each_entry_safe(req, tmp, &head->winds, winds)
+ {
+ __wb_fulfill_request(req);
+ }
- return 0;
+ __wb_fulfill_request(head);
+ }
+ UNLOCK(&wb_inode->lock);
}
-
-int32_t
-wb_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd)
+void
+__wb_fulfill_err(wb_request_t *head, int op_errno)
{
- STACK_WIND (frame,
- wb_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd);
- return 0;
-}
+ wb_request_t *req = NULL, *tmp = NULL;
+
+ if (!head)
+ goto out;
+
+ head->wb_inode->dontsync++;
+ list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds)
+ {
+ __wb_fulfill_request_err(req, op_errno);
+ }
+
+ __wb_fulfill_request_err(head, op_errno);
+
+out:
+ return;
+}
-int32_t
-wb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
+void
+wb_fulfill_err(wb_request_t *head, int op_errno)
{
- wb_file_t *file = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1;
- int op_errno = EINVAL;
+ wb_inode_t *wb_inode = NULL;
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
+ wb_inode = head->wb_inode;
- STACK_UNWIND_STRICT (fstat, frame, -1, EBADFD, NULL);
- return 0;
- }
+ LOCK(&wb_inode->lock);
+ {
+ __wb_fulfill_err(head, op_errno);
+ }
+ UNLOCK(&wb_inode->lock);
+}
- file = (wb_file_t *)(long)tmp_file;
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, NULL);
- return 0;
- }
+void
+__wb_modify_write_request(wb_request_t *req, int synced_size)
+{
+ struct iovec *vector = NULL;
+ int count = 0;
- local->file = file;
+ if (!req || synced_size == 0)
+ goto out;
- frame->local = local;
+ req->write_size -= synced_size;
+ req->stub->args.offset += synced_size;
- if (file) {
- stub = fop_fstat_stub (frame, wb_fstat_helper, fd);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
-
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ vector = req->stub->args.vector;
+ count = req->stub->args.count;
- /*
- FIXME:should the request queue be emptied in case of error?
- */
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
- } else {
- STACK_WIND (frame,
- wb_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd);
- }
+ req->stub->args.count = iov_skip(vector, count, synced_size);
- return 0;
-unwind:
- STACK_UNWIND_STRICT (fstat, frame, -1, op_errno, NULL);
+out:
+ return;
+}
- if (stub) {
- call_stub_destroy (stub);
- }
+int
+__wb_fulfill_short_write(wb_request_t *req, int size, gf_boolean_t *fulfilled)
+{
+ int accounted_size = 0;
- return 0;
+ if (req == NULL)
+ goto out;
+
+ if (req->write_size <= size) {
+ accounted_size = req->write_size;
+ __wb_fulfill_request(req);
+ *fulfilled = 1;
+ } else {
+ accounted_size = size;
+ __wb_modify_write_request(req, size);
+ *fulfilled = 0;
+ }
+
+out:
+ return accounted_size;
}
+void
+wb_fulfill_short_write(wb_request_t *head, int size)
+{
+ wb_inode_t *wb_inode = NULL;
+ wb_request_t *req = NULL, *next = NULL;
+ int accounted_size = 0;
+ gf_boolean_t fulfilled = _gf_false;
+
+ if (!head)
+ goto out;
+
+ wb_inode = head->wb_inode;
+
+ req = head;
+
+ LOCK(&wb_inode->lock);
+ {
+ /* hold a reference to head so that __wb_fulfill_short_write
+ * won't free it. We need head for a cleaner list traversal as
+ * list_for_each_entry_safe doesn't iterate over "head" member.
+ * So, if we pass "next->winds" as head to list_for_each_entry,
+ * "next" is skipped. For a simpler logic we need to traverse
+ * the list in the order. So, we start traversal from
+ * "head->winds" and hence we want head to be alive.
+ */
+ __wb_request_ref(head);
-int32_t
-wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
- call_frame_t *process_frame = NULL;
- int32_t ret = -1;
- fd_t *fd = NULL;
-
- local = frame->local;
- file = local->file;
- request = local->request;
-
- if ((request != NULL) && (file != NULL)) {
- process_frame = copy_frame (frame);
- if (process_frame == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
- }
+ next = list_entry(head->winds.next, wb_request_t, winds);
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf);
+ accounted_size = __wb_fulfill_short_write(head, size, &fulfilled);
- if (request) {
- wb_request_unref (request);
- }
+ size -= accounted_size;
- if (process_frame != NULL) {
- ret = wb_process_queue (process_frame, file);
- if ((ret == -1) && (errno == ENOMEM) && (file != NULL)) {
- LOCK (&file->lock);
- {
- file->op_ret = -1;
- file->op_errno = ENOMEM;
- }
- UNLOCK (&file->lock);
- }
+ if (size == 0) {
+ if (fulfilled && (next != head))
+ req = next;
- STACK_DESTROY (process_frame->root);
+ goto done;
}
- if (file) {
- LOCK (&file->lock);
- {
- fd = file->fd;
- }
- UNLOCK (&file->lock);
+ list_for_each_entry_safe(req, next, &head->winds, winds)
+ {
+ accounted_size = __wb_fulfill_short_write(req, size, &fulfilled);
+ size -= accounted_size;
- fd_unref (fd);
+ if (size == 0) {
+ if (fulfilled && (next != head))
+ req = next;
+ break;
+ }
}
+ done:
+ __wb_request_unref(head);
+ }
+ UNLOCK(&wb_inode->lock);
- return 0;
+ wb_add_head_for_retry(req);
+out:
+ return;
}
-
-static int32_t
-wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset)
+int
+wb_fulfill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- STACK_WIND (frame,
- wb_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc,
- offset);
+ wb_inode_t *wb_inode = NULL;
+ wb_request_t *head = NULL;
+
+ head = frame->local;
+ frame->local = NULL;
+
+ wb_inode = head->wb_inode;
+
+ /* There could be a readdirp session in progress. Since wb_fulfill_cbk
+ * can potentially remove a request from liability queue,
+ * wb_readdirp_cbk will miss writes on this inode (as it invalidates
+ * stats only if liability queue is not empty) and hence mark inode
+ * for invalidation of stats in readdirp response. Specifically this
+ * code fixes the following race mentioned in wb_readdirp_cbk:
+ */
+
+ /* <removed comment from wb_readdirp_cbk>
+ * We cannot guarantee integrity of entry->d_stat as there are cached
+ * writes. The stat is most likely stale as it doesn't account the
+ * cached writes. However, checking for non-empty liability list here is
+ * not a fool-proof solution as there can be races like,
+ * 1. readdirp is successful on posix
+ * 2. sync of cached write is successful on posix
+ * 3. write-behind received sync response and removed the request from
+ * liability queue
+ * 4. readdirp response is processed at write-behind
+ *
+ * In the above scenario, stat for the file is sent back in readdirp
+ * response but it is stale.
+ * </comment> */
+ wb_set_invalidate(wb_inode);
+
+ if (op_ret == -1) {
+ wb_fulfill_err(head, op_errno);
+ } else if (op_ret < head->total_size) {
+ wb_fulfill_short_write(head, op_ret);
+ } else {
+ wb_head_done(head);
+ }
+
+ wb_process_queue(wb_inode);
+
+ STACK_DESTROY(frame->root);
+
+ return 0;
+}
- return 0;
+#define WB_IOV_LOAD(vec, cnt, req, head) \
+ do { \
+ memcpy(&vec[cnt], req->stub->args.vector, \
+ (req->stub->args.count * sizeof(vec[0]))); \
+ cnt += req->stub->args.count; \
+ head->total_size += req->write_size; \
+ } while (0)
+
+int
+wb_fulfill_head(wb_inode_t *wb_inode, wb_request_t *head)
+{
+ struct iovec vector[MAX_VECTOR_COUNT];
+ int count = 0;
+ wb_request_t *req = NULL;
+ call_frame_t *frame = NULL;
+
+ /* make sure head->total_size is updated before we run into any
+ * errors
+ */
+
+ WB_IOV_LOAD(vector, count, head, head);
+
+ list_for_each_entry(req, &head->winds, winds)
+ {
+ WB_IOV_LOAD(vector, count, req, head);
+
+ if (iobref_merge(head->stub->args.iobref, req->stub->args.iobref))
+ goto err;
+ }
+
+ frame = create_frame(wb_inode->this, wb_inode->this->ctx->pool);
+ if (!frame)
+ goto err;
+
+ frame->root->lk_owner = head->lk_owner;
+ frame->root->pid = head->client_pid;
+ frame->local = head;
+
+ LOCK(&wb_inode->lock);
+ {
+ wb_inode->transit += head->total_size;
+ }
+ UNLOCK(&wb_inode->lock);
+
+ STACK_WIND(frame, wb_fulfill_cbk, FIRST_CHILD(frame->this),
+ FIRST_CHILD(frame->this)->fops->writev, head->fd, vector, count,
+ head->stub->args.offset, head->stub->args.flags,
+ head->stub->args.iobref, NULL);
+
+ return 0;
+err:
+ /* frame creation failure */
+ wb_fulfill_err(head, ENOMEM);
+
+ return ENOMEM;
}
+#define NEXT_HEAD(head, req) \
+ do { \
+ if (head) \
+ ret |= wb_fulfill_head(wb_inode, head); \
+ head = req; \
+ expected_offset = req->stub->args.offset + req->write_size; \
+ curr_aggregate = 0; \
+ vector_count = 0; \
+ } while (0)
-int32_t
-wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
+int
+wb_fulfill(wb_inode_t *wb_inode, list_head_t *liabilities)
{
- wb_file_t *file = NULL;
- fd_t *iter_fd = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1, op_errno = ENOMEM;
+ wb_request_t *req = NULL;
+ wb_request_t *head = NULL;
+ wb_request_t *tmp = NULL;
+ wb_conf_t *conf = NULL;
+ off_t expected_offset = 0;
+ size_t curr_aggregate = 0;
+ size_t vector_count = 0;
+ int ret = 0;
- if (loc->inode)
- {
- /*
- FIXME: fd_lookup extends life of fd till the execution of
- truncate_cbk
- */
- iter_fd = fd_lookup (loc->inode, frame->root->pid);
- if (iter_fd) {
- if (!fd_ctx_get (iter_fd, this, &tmp_file)){
- file = (wb_file_t *)(long)tmp_file;
- } else {
- fd_unref (iter_fd);
- }
- }
- }
-
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ conf = wb_inode->this->private;
- local->file = file;
-
- frame->local = local;
- if (file) {
- stub = fop_truncate_stub (frame, wb_truncate_helper, loc,
- offset);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ list_for_each_entry_safe(req, tmp, liabilities, winds)
+ {
+ list_del_init(&req->winds);
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
-
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
- } else {
- STACK_WIND (frame,
- wb_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc,
- offset);
+ if (!head) {
+ NEXT_HEAD(head, req);
+ continue;
}
- return 0;
-
-unwind:
- STACK_UNWIND_STRICT (truncate, frame, -1, op_errno, NULL, NULL);
+ if (req->fd != head->fd) {
+ NEXT_HEAD(head, req);
+ continue;
+ }
- if (stub) {
- call_stub_destroy (stub);
+ if (!is_same_lkowner(&req->lk_owner, &head->lk_owner)) {
+ NEXT_HEAD(head, req);
+ continue;
}
- return 0;
-}
+ if (expected_offset != req->stub->args.offset) {
+ NEXT_HEAD(head, req);
+ continue;
+ }
+ if ((curr_aggregate + req->write_size) > conf->aggregate_size) {
+ NEXT_HEAD(head, req);
+ continue;
+ }
-int32_t
-wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
-{
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
- int32_t ret = -1;
-
- local = frame->local;
- file = local->file;
- request = local->request;
-
- if ((request != NULL) && (file != NULL)) {
- wb_request_unref (request);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
+ if (vector_count + req->stub->args.count > MAX_VECTOR_COUNT) {
+ NEXT_HEAD(head, req);
+ continue;
}
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf);
+ list_add_tail(&req->winds, &head->winds);
+ curr_aggregate += req->write_size;
+ vector_count += req->stub->args.count;
+ }
- return 0;
-}
+ if (head)
+ ret |= wb_fulfill_head(wb_inode, head);
+ return ret;
+}
-static int32_t
-wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset)
+void
+wb_do_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
{
- STACK_WIND (frame,
- wb_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd,
- offset);
- return 0;
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
+ call_frame_t *frame = NULL;
+ struct iatt buf = {
+ 0,
+ };
+
+ list_for_each_entry_safe(req, tmp, lies, unwinds)
+ {
+ frame = req->stub->frame;
+
+ STACK_UNWIND_STRICT(writev, frame, req->op_ret, req->op_errno, &buf,
+ &buf, NULL); /* :O */
+ req->stub->frame = NULL;
+
+ list_del_init(&req->unwinds);
+ wb_request_unref(req);
+ }
+
+ return;
}
-
-int32_t
-wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
-{
- wb_file_t *file = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1;
- int op_errno = EINVAL;
-
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
-
- STACK_UNWIND_STRICT (ftruncate, frame, -1, EBADFD,
- NULL, NULL);
- return 0;
- }
+void
+__wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
+{
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
+ char gfid[64] = {
+ 0,
+ };
- file = (wb_file_t *)(long)tmp_file;
+ list_for_each_entry_safe(req, tmp, &wb_inode->temptation, lie)
+ {
+ if (!req->ordering.fulfilled &&
+ wb_inode->window_current > wb_inode->window_conf)
+ continue;
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM,
- NULL, NULL);
- return 0;
- }
+ list_del_init(&req->lie);
+ list_move_tail(&req->unwinds, lies);
- local->file = file;
+ wb_inode->window_current += req->orig_size;
- frame->local = local;
+ wb_inode->gen++;
- if (file) {
- stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
- offset);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ if (!req->ordering.fulfilled) {
+ /* burden increased */
+ list_add_tail(&req->lie, &wb_inode->liability);
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ req->ordering.lied = 1;
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
- } else {
- STACK_WIND (frame,
- wb_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd,
- offset);
+ uuid_utoa_r(req->gfid, gfid);
+ gf_msg_debug(wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): added req to liability "
+ "queue. inode-generation-number=%" PRIu64,
+ req->stub->frame->root->unique, gf_fop_list[req->fop],
+ gfid, req->gen, wb_inode->gen);
}
+ }
- return 0;
+ return;
+}
-unwind:
- STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL);
+int
+__wb_collapse_small_writes(wb_conf_t *conf, wb_request_t *holder,
+ wb_request_t *req)
+{
+ char *ptr = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ int ret = -1;
+ ssize_t required_size = 0;
+ size_t holder_len = 0;
+ size_t req_len = 0;
- if (stub) {
- call_stub_destroy (stub);
- }
+ if (!holder->iobref) {
+ holder_len = iov_length(holder->stub->args.vector,
+ holder->stub->args.count);
+ req_len = iov_length(req->stub->args.vector, req->stub->args.count);
- return 0;
-}
+ required_size = max((conf->page_size), (holder_len + req_len));
+ iobuf = iobuf_get2(req->wb_inode->this->ctx->iobuf_pool, required_size);
+ if (iobuf == NULL) {
+ goto out;
+ }
+ iobref = iobref_new();
+ if (iobref == NULL) {
+ iobuf_unref(iobuf);
+ goto out;
+ }
-int32_t
-wb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost)
-{
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- call_frame_t *process_frame = NULL;
- wb_file_t *file = NULL;
- int32_t ret = -1;
- fd_t *fd = NULL;
-
- local = frame->local;
- file = local->file;
- request = local->request;
-
- if (request) {
- process_frame = copy_frame (frame);
- if (process_frame == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
+ ret = iobref_add(iobref, iobuf);
+ if (ret != 0) {
+ gf_msg(req->wb_inode->this->name, GF_LOG_WARNING, -ret,
+ WRITE_BEHIND_MSG_INVALID_ARGUMENT,
+ "cannot add iobuf (%p) into iobref (%p)", iobuf, iobref);
+ iobuf_unref(iobuf);
+ iobref_unref(iobref);
+ goto out;
}
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre,
- statpost);
+ iov_unload(iobuf->ptr, holder->stub->args.vector,
+ holder->stub->args.count);
+ holder->stub->args.vector[0].iov_base = iobuf->ptr;
+ holder->stub->args.count = 1;
- if (request) {
- wb_request_unref (request);
- }
+ iobref_unref(holder->stub->args.iobref);
+ holder->stub->args.iobref = iobref;
- if (request && (process_frame != NULL)) {
- ret = wb_process_queue (process_frame, file);
- if ((ret == -1) && (errno == ENOMEM) && (file != NULL)) {
- LOCK (&file->lock);
- {
- file->op_ret = -1;
- file->op_errno = ENOMEM;
- }
- UNLOCK (&file->lock);
- }
+ iobuf_unref(iobuf);
- STACK_DESTROY (process_frame->root);
- }
+ holder->iobref = iobref_ref(iobref);
+ }
- if (file) {
- LOCK (&file->lock);
- {
- fd = file->fd;
- }
- UNLOCK (&file->lock);
+ ptr = holder->stub->args.vector[0].iov_base + holder->write_size;
- fd_unref (fd);
- }
+ iov_unload(ptr, req->stub->args.vector, req->stub->args.count);
- return 0;
-}
+ holder->stub->args.vector[0].iov_len += req->write_size;
+ holder->write_size += req->write_size;
+ holder->ordering.size += req->write_size;
+ ret = 0;
+out:
+ return ret;
+}
-static int32_t
-wb_setattr_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
+void
+__wb_preprocess_winds(wb_inode_t *wb_inode)
{
- STACK_WIND (frame,
- wb_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr,
- loc,
- stbuf,
- valid);
+ off_t offset_expected = 0;
+ ssize_t space_left = 0;
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
+ wb_request_t *holder = NULL;
+ wb_conf_t *conf = NULL;
+ int ret = 0;
+ ssize_t page_size = 0;
+ char gfid[64] = {
+ 0,
+ };
+
+ /* With asynchronous IO from a VM guest (as a file), there
+ can be two sequential writes happening in two regions
+ of the file. But individual (broken down) IO requests
+ can arrive interleaved.
+
+ TODO: cycle for each such sequence sifting
+ through the interleaved ops
+ */
+
+ conf = wb_inode->this->private;
+ page_size = conf->page_size;
+
+ list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo)
+ {
+ if (wb_inode->dontsync && req->ordering.lied) {
+ /* sync has failed. Don't pick lies _again_ for winding
+ * as winding these lies again will trigger an infinite
+ * recursion of wb_process_queue being called from a
+ * failed fulfill. However, pick non-lied requests for
+ * winding so that application won't block indefinitely
+ * waiting for write result.
+ */
+
+ uuid_utoa_r(req->gfid, gfid);
+ gf_msg_debug(wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): not setting ordering.go"
+ "as dontsync is set",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen);
+
+ continue;
+ }
+
+ if (!req->ordering.tempted) {
+ if (holder) {
+ if (wb_requests_conflict(holder, req))
+ /* do not hold on write if a
+ dependent write is in queue */
+ holder->ordering.go = 1;
+ }
+ /* collapse only non-sync writes */
+ continue;
+ } else if (!holder) {
+ /* holder is always a non-sync write */
+ holder = req;
+ continue;
+ }
+
+ offset_expected = holder->stub->args.offset + holder->write_size;
+
+ if (req->stub->args.offset != offset_expected) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
+ if (!is_same_lkowner(&req->lk_owner, &holder->lk_owner)) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
+ if (req->fd != holder->fd) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
+ space_left = page_size - holder->write_size;
+
+ if (space_left < req->write_size) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
+ ret = __wb_collapse_small_writes(conf, holder, req);
+ if (ret)
+ continue;
+
+ /* collapsed request is as good as wound
+ (from its p.o.v)
+ */
+ list_del_init(&req->todo);
+ __wb_fulfill_request(req);
- return 0;
+ /* Only the last @holder in queue which
+
+ - does not have any non-buffered-writes following it
+ - has not yet filled its capacity
+
+ does not get its 'go' set, in anticipation of the arrival
+ of consecutive smaller writes.
+ */
+ }
+
+ /* but if trickling writes are enabled, then do not hold back
+ writes if there are no outstanding requests
+ */
+
+ if (conf->trickling_writes && !wb_inode->transit && holder)
+ holder->ordering.go = 1;
+
+ if (wb_inode->dontsync > 0)
+ wb_inode->dontsync--;
+
+ return;
}
+int
+__wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict,
+ list_head_t *tasks)
+{
+ wb_conf_t *conf = NULL;
+ char gfid[64] = {
+ 0,
+ };
+
+ conf = req->wb_inode->this->private;
+
+ uuid_utoa_r(req->gfid, gfid);
+
+ if ((req->stub->fop != GF_FOP_FLUSH) &&
+ ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) {
+ if (!req->ordering.lied && list_empty(&conflict->wip)) {
+ /* If request itself is in liability queue,
+ * 1. We cannot unwind as the response has already been
+ * sent.
+ * 2. We cannot wind till conflict clears up.
+ * 3. So, skip the request for now.
+ * 4. Otherwise, resume (unwind) it with error.
+ */
+ req->op_ret = -1;
+ req->op_errno = conflict->op_errno;
+ if ((req->stub->fop == GF_FOP_TRUNCATE) ||
+ (req->stub->fop == GF_FOP_FTRUNCATE)) {
+ req->stub->frame->local = NULL;
+ }
+
+ list_del_init(&req->todo);
+ list_add_tail(&req->winds, tasks);
+
+ gf_msg_debug(req->wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): A conflicting write "
+ "request in liability queue has failed "
+ "to sync (error = \"%s\"), "
+ "unwinding this request as a failure",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen,
+ strerror(req->op_errno));
+
+ if (req->ordering.tempted) {
+ /* make sure that it won't be unwound in
+ * wb_do_unwinds too. Otherwise there'll be
+ * a double wind.
+ */
+ list_del_init(&req->lie);
+
+ gf_msg_debug(req->wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, "
+ "gfid=%s, gen=%" PRIu64
+ "): "
+ "removed from liability queue",
+ req->unique, gf_fop_list[req->fop], gfid,
+ req->gen);
+
+ __wb_fulfill_request(req);
+ }
+ }
+ } else {
+ gf_msg_debug(req->wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): A conflicting write request "
+ "in liability queue has failed to sync "
+ "(error = \"%s\"). This is an "
+ "FSYNC/FLUSH and we need to maintain ordering "
+ "guarantees with other writes in TODO queue. "
+ "Hence doing nothing now",
+ req->unique, gf_fop_list[req->fop], gfid, req->gen,
+ strerror(conflict->op_errno));
+
+ /* flush and fsync (without conf->resync_after_fsync) act as
+ barriers. We cannot unwind them out of
+ order, when there are earlier generation writes just because
+ there is a conflicting liability with an error. So, wait for
+ our turn till there are no conflicting liabilities.
+
+ This situation can arise when there liabilities spread across
+ multiple generations. For eg., consider two writes with
+ following characterstics:
+
+ 1. they belong to different generations gen1, gen2 and
+ (gen1 > gen2).
+ 2. they overlap.
+ 3. both are liabilities.
+ 4. gen1 write was attempted to sync, but the attempt failed.
+ 5. there was no attempt to sync gen2 write yet.
+ 6. A flush (as part of close) is issued and gets a gen no
+ gen3.
+
+ In the above scenario, if flush is unwound without waiting
+ for gen1 and gen2 writes either to be successfully synced or
+ purged, we end up with these two writes in wb_inode->todo
+ list forever as there will be no attempt to process the queue
+ as flush is the last operation.
+ */
+ }
+
+ return 0;
+}
-int32_t
-wb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid)
+int
+__wb_pick_winds(wb_inode_t *wb_inode, list_head_t *tasks,
+ list_head_t *liabilities)
{
- wb_file_t *file = NULL;
- fd_t *iter_fd = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1, op_errno = EINVAL;
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
+ wb_request_t *conflict = NULL;
+ char req_gfid[64] =
+ {
+ 0,
+ },
+ conflict_gfid[64] = {
+ 0,
+ };
+
+ list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo)
+ {
+ uuid_utoa_r(req->gfid, req_gfid);
+
+ conflict = wb_liability_has_conflict(wb_inode, req);
+ if (conflict) {
+ uuid_utoa_r(conflict->gfid, conflict_gfid);
+
+ gf_msg_debug(wb_inode->this->name, 0,
+ "Not winding request due to a "
+ "conflicting write in liability queue. "
+ "REQ: unique=%" PRIu64
+ ", fop=%s, "
+ "gen=%" PRIu64
+ ", gfid=%s. "
+ "CONFLICT: unique=%" PRIu64
+ ", fop=%s, "
+ "gen=%" PRIu64
+ ", gfid=%s, "
+ "conflicts-sync-failed?=%s, "
+ "conflicts-error=%s",
+ req->unique, gf_fop_list[req->fop], req->gen, req_gfid,
+ conflict->unique, gf_fop_list[conflict->fop],
+ conflict->gen, conflict_gfid,
+ (conflict->op_ret == 1) ? "yes" : "no",
+ strerror(conflict->op_errno));
+
+ if (conflict->op_ret == -1) {
+ /* There is a conflicting liability which failed
+ * to sync in previous attempts, resume the req
+ * and fail, unless its an fsync/flush.
+ */
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
+ __wb_handle_failed_conflict(req, conflict, tasks);
+ } else {
+ /* There is a conflicting liability which was
+ * not attempted to sync even once. Wait till
+ * at least one attempt to sync is made.
+ */
+ }
+
+ continue;
+ }
+
+ if (req->ordering.tempted && !req->ordering.go) {
+ /* wait some more */
+ gf_msg_debug(wb_inode->this->name, 0,
+ "(unique=%" PRIu64 ", fop=%s, gen=%" PRIu64
+ ", gfid=%s): ordering.go is not set, "
+ "hence not winding",
+ req->unique, gf_fop_list[req->fop], req->gen,
+ req_gfid);
+ continue;
+ }
+
+ if (req->stub->fop == GF_FOP_WRITE) {
+ conflict = wb_wip_has_conflict(wb_inode, req);
+
+ if (conflict) {
+ uuid_utoa_r(conflict->gfid, conflict_gfid);
+
+ gf_msg_debug(wb_inode->this->name, 0,
+ "Not winding write request as "
+ "a conflicting write is being "
+ "synced to backend. "
+ "REQ: unique=%" PRIu64
+ " fop=%s,"
+ " gen=%" PRIu64
+ ", gfid=%s. "
+ "CONFLICT: unique=%" PRIu64
+ " "
+ "fop=%s, gen=%" PRIu64
+ ", "
+ "gfid=%s",
+ req->unique, gf_fop_list[req->fop], req->gen,
+ req_gfid, conflict->unique,
+ gf_fop_list[conflict->fop], conflict->gen,
+ conflict_gfid);
+ continue;
+ }
+
+ list_add_tail(&req->wip, &wb_inode->wip);
+ req->wind_count++;
+
+ if (!req->ordering.tempted)
+ /* unrefed in wb_writev_cbk */
+ req->stub->frame->local = __wb_request_ref(req);
+ }
+
+ gf_msg_debug(wb_inode->this->name, 0,
+ "(unique=%" PRIu64
+ ", fop=%s, gfid=%s, "
+ "gen=%" PRIu64
+ "): picking the request for "
+ "winding",
+ req->unique, gf_fop_list[req->fop], req_gfid, req->gen);
+
+ list_del_init(&req->todo);
+
+ if (req->ordering.tempted) {
+ list_add_tail(&req->winds, liabilities);
+ } else {
+ list_add_tail(&req->winds, tasks);
}
+ }
+
+ return 0;
+}
+void
+wb_do_winds(wb_inode_t *wb_inode, list_head_t *tasks)
+{
+ wb_request_t *req = NULL;
+ wb_request_t *tmp = NULL;
- frame->local = local;
+ list_for_each_entry_safe(req, tmp, tasks, winds)
+ {
+ list_del_init(&req->winds);
- if (!(valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME))) {
- STACK_WIND (frame,
- wb_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid);
- goto out;
+ if (req->op_ret == -1) {
+ call_unwind_error_keep_stub(req->stub, req->op_ret, req->op_errno);
+ } else {
+ call_resume_keep_stub(req->stub);
}
- if (loc->inode) {
- /*
- FIXME: fd_lookup extends life of fd till the execution
- of wb_utimens_cbk
- */
- iter_fd = fd_lookup (loc->inode, frame->root->pid);
- if (iter_fd) {
- if (!fd_ctx_get (iter_fd, this, &tmp_file)) {
- file = (wb_file_t *)(long)tmp_file;
- } else {
- fd_unref (iter_fd);
- }
- }
+ wb_request_unref(req);
+ }
+}
- }
+void
+wb_process_queue(wb_inode_t *wb_inode)
+{
+ list_head_t tasks;
+ list_head_t lies;
+ list_head_t liabilities;
+ int wind_failure = 0;
- local->file = file;
+ INIT_LIST_HEAD(&tasks);
+ INIT_LIST_HEAD(&lies);
+ INIT_LIST_HEAD(&liabilities);
- if (file) {
- stub = fop_setattr_stub (frame, wb_setattr_helper, loc, stbuf, valid);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ do {
+ gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG,
+ "processing queues");
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ LOCK(&wb_inode->lock);
+ {
+ __wb_preprocess_winds(wb_inode);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
- } else {
- STACK_WIND (frame,
- wb_setattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setattr,
- loc,
- stbuf, valid);
+ __wb_pick_winds(wb_inode, &tasks, &liabilities);
+
+ __wb_pick_unwinds(wb_inode, &lies);
}
+ UNLOCK(&wb_inode->lock);
- return 0;
-unwind:
- STACK_UNWIND_STRICT (setattr, frame, -1, op_errno,
- NULL, NULL);
+ if (!list_empty(&lies))
+ wb_do_unwinds(wb_inode, &lies);
- if (stub) {
- call_stub_destroy (stub);
- }
-out:
- return 0;
+ if (!list_empty(&tasks))
+ wb_do_winds(wb_inode, &tasks);
+
+ /* If there is an error in wb_fulfill before winding write
+ * requests, we would miss invocation of wb_process_queue
+ * from wb_fulfill_cbk. So, retry processing again.
+ */
+ if (!list_empty(&liabilities))
+ wind_failure = wb_fulfill(wb_inode, &liabilities);
+ } while (wind_failure);
+
+ return;
}
-int32_t
-wb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, fd_t *fd)
+void
+wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf)
{
- int32_t wbflags = 0, flags = 0;
- wb_file_t *file = NULL;
- wb_conf_t *conf = NULL;
- wb_local_t *local = NULL;
+ GF_ASSERT(wb_inode);
+ GF_ASSERT(postbuf);
+
+ LOCK(&wb_inode->lock);
+ {
+ wb_inode->size = postbuf->ia_size;
+ }
+ UNLOCK(&wb_inode->lock);
+}
- conf = this->private;
+int
+wb_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ wb_request_t *req = NULL;
+ wb_inode_t *wb_inode;
- local = frame->local;
- if (local == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ req = frame->local;
+ frame->local = NULL;
+ wb_inode = req->wb_inode;
- flags = local->flags;
- wbflags = local->wbflags;
+ LOCK(&req->wb_inode->lock);
+ {
+ list_del_init(&req->wip);
+ }
+ UNLOCK(&req->wb_inode->lock);
- if (op_ret != -1) {
- file = wb_file_create (this, fd, flags);
- if (file == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
+ wb_request_unref(req);
- /* If O_DIRECT then, we disable chaching */
- if (((flags & O_DIRECT) == O_DIRECT)
- || ((flags & O_ACCMODE) == O_RDONLY)
- || (((flags & O_SYNC) == O_SYNC)
- && conf->enable_O_SYNC == _gf_true)) {
- file->window_conf = 0;
- }
+ /* requests could be pending while this was in progress */
+ wb_process_queue(wb_inode);
- if (wbflags & GF_OPEN_NOWB) {
- file->disabled = 1;
- }
-
- LOCK_INIT (&file->lock);
- }
-
-out:
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd);
- return 0;
+ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int
+wb_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ STACK_WIND(frame, wb_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
+}
-int32_t
-wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, int32_t wbflags)
+int
+wb_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
{
- wb_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ wb_inode_t *wb_inode = NULL;
+ wb_conf_t *conf = NULL;
+ gf_boolean_t wb_disabled = 0;
+ call_stub_t *stub = NULL;
+ int ret = -1;
+ int32_t op_errno = EINVAL;
+ int o_direct = O_DIRECT;
+
+ conf = this->private;
+
+ wb_inode = wb_inode_create(this, fd->inode);
+ if (!wb_inode) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ if (!conf->strict_O_DIRECT)
+ o_direct = 0;
+
+ if (fd->flags & (O_SYNC | O_DSYNC | o_direct))
+ wb_disabled = 1;
+
+ if (flags & (O_SYNC | O_DSYNC | o_direct))
+ wb_disabled = 1;
+
+ if (wb_disabled)
+ stub = fop_writev_stub(frame, wb_writev_helper, fd, vector, count,
+ offset, flags, iobref, xdata);
+ else
+ stub = fop_writev_stub(frame, NULL, fd, vector, count, offset, flags,
+ iobref, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ if (wb_disabled)
+ ret = wb_enqueue(wb_inode, stub);
+ else
+ ret = wb_enqueue_tempted(wb_inode, stub);
+
+ if (!ret) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ wb_process_queue(wb_inode);
+
+ return 0;
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+unwind:
+ STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL);
- local->flags = flags;
- local->wbflags = wbflags;
-
- frame->local = local;
+ if (stub)
+ call_stub_destroy(stub);
- STACK_WIND (frame,
- wb_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, wbflags);
- return 0;
+ return 0;
+}
-unwind:
- STACK_UNWIND_STRICT (open, frame, -1, op_errno, NULL);
- return 0;
+int
+wb_readv_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+ return 0;
}
+int
+wb_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
-int32_t
-wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent)
-{
- long flags = 0;
- wb_file_t *file = NULL;
- wb_conf_t *conf = this->private;
-
- if (op_ret != -1) {
- if (frame->local) {
- flags = (long) frame->local;
- }
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- file = wb_file_create (this, fd, flags);
- if (file == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
+ stub = fop_readv_stub(frame, wb_readv_helper, fd, size, offset, flags,
+ xdata);
+ if (!stub)
+ goto unwind;
- /* If O_DIRECT then, we disable chaching */
- if (frame->local) {
- if (((flags & O_DIRECT) == O_DIRECT)
- || ((flags & O_ACCMODE) == O_RDONLY)
- || (((flags & O_SYNC) == O_SYNC)
- && (conf->enable_O_SYNC == _gf_true))) {
- file->window_conf = 0;
- }
- }
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- LOCK_INIT (&file->lock);
- }
-
- frame->local = NULL;
+ wb_process_queue(wb_inode);
-out:
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent);
- return 0;
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
+
+noqueue:
+ STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+ return 0;
}
+int
+wb_flush_bg_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
+}
-int32_t
-wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, fd_t *fd, dict_t *params)
+int
+wb_flush_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- frame->local = (void *)(long)flags;
+ wb_conf_t *conf = NULL;
+ wb_inode_t *wb_inode = NULL;
+ call_frame_t *bg_frame = NULL;
+ int32_t op_errno = 0;
+ int op_ret = 0;
+
+ conf = this->private;
+
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto unwind;
+ }
+
+ if (conf->flush_behind)
+ goto flushbehind;
+
+ STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+
+flushbehind:
+ bg_frame = copy_frame(frame);
+ if (!bg_frame) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ STACK_WIND(bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ /* fall through */
+unwind:
+ STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL);
- STACK_WIND (frame, wb_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, fd, params);
- return 0;
+ return 0;
}
-/* Mark all the contiguous write requests for winding starting from head of
- * request list. Stops marking at the first non-write request found. If
- * file is opened with O_APPEND, make sure all the writes marked for winding
- * will fit into a single write call to server.
- */
-size_t
-__wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds)
+int
+wb_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- wb_request_t *request = NULL;
- size_t size = 0;
- char first_request = 1;
- off_t offset_expected = 0;
- wb_conf_t *conf = NULL;
- int count = 0;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- conf = file->this->private;
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- list_for_each_entry (request, list, list)
- {
- if ((request->stub == NULL)
- || (request->stub->fop != GF_FOP_WRITE)) {
- break;
- }
+ stub = fop_flush_stub(frame, wb_flush_helper, fd, xdata);
+ if (!stub)
+ goto unwind;
- if (!request->flags.write_request.stack_wound) {
- if (first_request) {
- first_request = 0;
- offset_expected = request->stub->args.writev.off;
- }
-
- if (request->stub->args.writev.off != offset_expected) {
- break;
- }
-
- if ((file->flags & O_APPEND)
- && (((size + request->write_size)
- > conf->aggregate_size)
- || ((count + request->stub->args.writev.count)
- > MAX_VECTOR_COUNT))) {
- break;
- }
-
- size += request->write_size;
- offset_expected += request->write_size;
- file->aggregate_current -= request->write_size;
- count += request->stub->args.writev.count;
-
- request->flags.write_request.stack_wound = 1;
- list_add_tail (&request->winds, winds);
- }
- }
-
- return size;
-}
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
+ wb_process_queue(wb_inode);
-void
-__wb_can_wind (list_head_t *list, char *other_fop_in_queue,
- char *non_contiguous_writes, char *incomplete_writes,
- char *wind_all)
-{
- wb_request_t *request = NULL;
- char first_request = 1;
- off_t offset_expected = 0;
+ return 0;
- list_for_each_entry (request, list, list)
- {
- if ((request->stub == NULL)
- || (request->stub->fop != GF_FOP_WRITE)) {
- if (request->stub && other_fop_in_queue) {
- *other_fop_in_queue = 1;
- }
- break;
- }
+unwind:
+ STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
- if (request->flags.write_request.stack_wound
- && !request->flags.write_request.got_reply
- && (incomplete_writes != NULL)) {
- *incomplete_writes = 1;
- break;
- }
+ if (stub)
+ call_stub_destroy(stub);
- if (!request->flags.write_request.stack_wound) {
- if (first_request) {
- first_request = 0;
- offset_expected
- = request->stub->args.writev.off;
- if (wind_all != NULL) {
- *wind_all = request->flags.write_request.flush_all;
- }
- }
-
- if (offset_expected != request->stub->args.writev.off) {
- if (non_contiguous_writes) {
- *non_contiguous_writes = 1;
- }
- break;
- }
-
- offset_expected += request->write_size;
- }
- }
+ return 0;
- return;
+noqueue:
+ STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
}
+int
+wb_fsync_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+ return 0;
+}
-ssize_t
-__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf,
- char enable_trickling_writes)
+int
+wb_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- size_t size = 0;
- char other_fop_in_queue = 0;
- char incomplete_writes = 0;
- char non_contiguous_writes = 0;
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
- char wind_all = 0;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_errno = EINVAL;
- if (list_empty (list)) {
- goto out;
- }
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- request = list_entry (list->next, typeof (*request), list);
- file = request->file;
+ stub = fop_fsync_stub(frame, wb_fsync_helper, fd, datasync, xdata);
+ if (!stub)
+ goto unwind;
- __wb_can_wind (list, &other_fop_in_queue,
- &non_contiguous_writes, &incomplete_writes, &wind_all);
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- if (!incomplete_writes && ((enable_trickling_writes)
- || (wind_all) || (non_contiguous_writes)
- || (other_fop_in_queue)
- || (file->aggregate_current
- >= aggregate_conf))) {
- size = __wb_mark_wind_all (file, list, winds);
- }
+ wb_process_queue(wb_inode);
-out:
- return size;
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
+
+noqueue:
+ STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+ return 0;
}
+int
+wb_stat_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
+}
-size_t
-__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size)
+int
+wb_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- size_t written_behind = 0;
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- if (list_empty (list)) {
- goto out;
- }
+ wb_inode = wb_inode_ctx_get(this, loc->inode);
+ if (!wb_inode)
+ goto noqueue;
- request = list_entry (list->next, typeof (*request), list);
- file = request->file;
+ stub = fop_stat_stub(frame, wb_stat_helper, loc, xdata);
+ if (!stub)
+ goto unwind;
- list_for_each_entry (request, list, list)
- {
- if ((request->stub == NULL)
- || (request->stub->fop != GF_FOP_WRITE)) {
- continue;
- }
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- if (written_behind <= size) {
- if (!request->flags.write_request.write_behind) {
- written_behind += request->write_size;
- request->flags.write_request.write_behind = 1;
- list_add_tail (&request->unwinds, unwinds);
-
- if (!request->flags.write_request.got_reply) {
- file->window_current += request->write_size;
- }
- }
- } else {
- break;
- }
- }
+ wb_process_queue(wb_inode);
-out:
- return written_behind;
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
+
+noqueue:
+ STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
}
+int
+wb_fstat_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
+}
-void
-__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds)
+int
+wb_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- wb_request_t *request = NULL;
- wb_file_t *file = NULL;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- if (list_empty (list)) {
- goto out;
- }
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- request = list_entry (list->next, typeof (*request), list);
- file = request->file;
+ stub = fop_fstat_stub(frame, wb_fstat_helper, fd, xdata);
+ if (!stub)
+ goto unwind;
- if (file->window_current <= file->window_conf) {
- __wb_mark_unwind_till (list, unwinds,
- file->window_conf - file->window_current);
- }
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
-out:
- return;
-}
+ wb_process_queue(wb_inode);
+ return 0;
-uint32_t
-__wb_get_other_requests (list_head_t *list, list_head_t *other_requests)
-{
- wb_request_t *request = NULL;
- uint32_t count = 0;
- list_for_each_entry (request, list, list) {
- if ((request->stub == NULL)
- || (request->stub->fop == GF_FOP_WRITE)) {
- break;
- }
-
- if (!request->flags.other_requests.marked_for_resume) {
- request->flags.other_requests.marked_for_resume = 1;
- list_add_tail (&request->other_requests,
- other_requests);
- count++;
- }
- }
+unwind:
+ STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL);
- return count;
-}
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
+noqueue:
+ STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
+}
int32_t
-wb_stack_unwind (list_head_t *unwinds)
+wb_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- struct iatt buf = {0,};
- wb_request_t *request = NULL, *dummy = NULL;
- call_frame_t *frame = NULL;
- wb_local_t *local = NULL;
- int ret = 0, write_requests_removed = 0;
+ GF_ASSERT(frame->local);
- list_for_each_entry_safe (request, dummy, unwinds, unwinds)
- {
- frame = request->stub->frame;
- local = frame->local;
+ if (op_ret == 0)
+ wb_set_inode_size(frame->local, postbuf);
- STACK_UNWIND (frame, local->op_ret, local->op_errno, &buf,
- &buf);
+ frame->local = NULL;
- ret = wb_request_unref (request);
- if (ret == 0) {
- write_requests_removed++;
- }
- }
+ STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
- return write_requests_removed;
+int
+wb_truncate_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND(frame, wb_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
}
+int
+wb_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
+
+ wb_inode = wb_inode_create(this, loc->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ frame->local = wb_inode;
+
+ stub = fop_truncate_stub(frame, wb_truncate_helper, loc, offset, xdata);
+ if (!stub)
+ goto unwind;
+
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
+
+ wb_process_queue(wb_inode);
+
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+
+ return 0;
+}
int32_t
-wb_resume_other_requests (call_frame_t *frame, wb_file_t *file,
- list_head_t *other_requests)
+wb_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- int32_t ret = 0;
- wb_request_t *request = NULL, *dummy = NULL;
- int32_t fops_removed = 0;
- char wind = 0;
- call_stub_t *stub = NULL;
+ GF_ASSERT(frame->local);
- if (list_empty (other_requests)) {
- goto out;
- }
+ if (op_ret == 0)
+ wb_set_inode_size(frame->local, postbuf);
- list_for_each_entry_safe (request, dummy, other_requests,
- other_requests) {
- wind = request->stub->wind;
- stub = request->stub;
-
- LOCK (&file->lock);
- {
- request->stub = NULL;
- }
- UNLOCK (&file->lock);
-
- if (!wind) {
- wb_request_unref (request);
- fops_removed++;
- }
-
- call_resume (stub);
- }
+ frame->local = NULL;
- if (fops_removed > 0) {
- ret = wb_process_queue (frame, file);
- }
-
-out:
- return ret;
+ STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
}
+int
+wb_ftruncate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, wb_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
-int32_t
-wb_do_ops (call_frame_t *frame, wb_file_t *file, list_head_t *winds,
- list_head_t *unwinds, list_head_t *other_requests)
+int
+wb_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- int32_t ret = -1, write_requests_removed = 0;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_errno = 0;
- ret = wb_stack_unwind (unwinds);
+ wb_inode = wb_inode_create(this, fd->inode);
+ if (!wb_inode) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
- write_requests_removed = ret;
+ frame->local = wb_inode;
- ret = wb_sync (frame, file, winds);
- if (ret == -1) {
- goto out;
- }
+ stub = fop_ftruncate_stub(frame, wb_ftruncate_helper, fd, offset, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
- wb_resume_other_requests (frame, file, other_requests);
+ if (!wb_enqueue(wb_inode, stub)) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
- /* wb_stack_unwind does wb_request_unref after unwinding a write
- * request. Hence if a write-request was just freed in wb_stack_unwind,
- * we have to process request queue once again to unblock requests
- * blocked on the writes just unwound.
- */
- if (write_requests_removed > 0) {
- ret = wb_process_queue (frame, file);
- }
+ wb_process_queue(wb_inode);
-out:
- return ret;
+ return 0;
+
+unwind:
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
}
+int
+wb_setattr_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+ return 0;
+}
-inline int
-__wb_copy_into_holder (wb_request_t *holder, wb_request_t *request)
+int
+wb_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
{
- char *ptr = NULL;
- struct iobuf *iobuf = NULL;
- struct iobref *iobref = NULL;
- int ret = -1;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- if (holder->flags.write_request.virgin) {
- iobuf = iobuf_get (request->file->this->ctx->iobuf_pool);
- if (iobuf == NULL) {
- gf_log (request->file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
+ wb_inode = wb_inode_ctx_get(this, loc->inode);
+ if (!wb_inode)
+ goto noqueue;
- iobref = iobref_new ();
- if (iobref == NULL) {
- iobuf_unref (iobuf);
- gf_log (request->file->this->name, GF_LOG_ERROR,
- "out of memory");
- goto out;
- }
-
- ret = iobref_add (iobref, iobuf);
- if (ret != 0) {
- iobuf_unref (iobuf);
- iobref_unref (iobref);
- gf_log (request->file->this->name, GF_LOG_DEBUG,
- "cannot add iobuf (%p) into iobref (%p)",
- iobuf, iobref);
- goto out;
- }
-
- iov_unload (iobuf->ptr, holder->stub->args.writev.vector,
- holder->stub->args.writev.count);
- holder->stub->args.writev.vector[0].iov_base = iobuf->ptr;
-
- iobref_unref (holder->stub->args.writev.iobref);
- holder->stub->args.writev.iobref = iobref;
-
- iobuf_unref (iobuf);
-
- holder->flags.write_request.virgin = 0;
- }
+ stub = fop_setattr_stub(frame, wb_setattr_helper, loc, stbuf, valid, xdata);
+ if (!stub)
+ goto unwind;
- ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size;
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- iov_unload (ptr,
- request->stub->args.writev.vector,
- request->stub->args.writev.count);
+ wb_process_queue(wb_inode);
- holder->stub->args.writev.vector[0].iov_len += request->write_size;
- holder->write_size += request->write_size;
+ return 0;
+unwind:
+ STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
- request->flags.write_request.stack_wound = 1;
- list_move_tail (&request->list, &request->file->passive_requests);
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
- ret = 0;
-out:
- return ret;
+noqueue:
+ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+ return 0;
}
+int
+wb_fsetattr_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+ return 0;
+}
-/* this procedure assumes that write requests have only one vector to write */
-void
-__wb_collapse_write_bufs (list_head_t *requests, size_t page_size)
-{
- off_t offset_expected = 0;
- size_t space_left = 0;
- wb_request_t *request = NULL, *tmp = NULL, *holder = NULL;
- int ret = 0;
-
- list_for_each_entry_safe (request, tmp, requests, list) {
- if ((request->stub == NULL)
- || (request->stub->fop != GF_FOP_WRITE)
- || (request->flags.write_request.stack_wound)) {
- holder = NULL;
- continue;
- }
+int
+wb_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- if (request->flags.write_request.write_behind) {
- if (holder == NULL) {
- holder = request;
- continue;
- }
-
- offset_expected = holder->stub->args.writev.off
- + holder->write_size;
-
- if (request->stub->args.writev.off != offset_expected) {
- holder = request;
- continue;
- }
-
- space_left = page_size - holder->write_size;
-
- if (space_left >= request->write_size) {
- ret = __wb_copy_into_holder (holder, request);
- if (ret != 0) {
- break;
- }
-
- __wb_request_unref (request);
- } else {
- holder = request;
- }
- } else {
- break;
- }
- }
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- return;
-}
+ stub = fop_fsetattr_stub(frame, wb_fsetattr_helper, fd, stbuf, valid,
+ xdata);
+ if (!stub)
+ goto unwind;
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
-int32_t
-wb_process_queue (call_frame_t *frame, wb_file_t *file)
-{
- list_head_t winds, unwinds, other_requests;
- size_t size = 0;
- wb_conf_t *conf = NULL;
- uint32_t count = 0;
- int32_t ret = -1;
+ wb_process_queue(wb_inode);
- INIT_LIST_HEAD (&winds);
- INIT_LIST_HEAD (&unwinds);
- INIT_LIST_HEAD (&other_requests);
-
- if (file == NULL) {
- errno = EINVAL;
- goto out;
- }
+ return 0;
+unwind:
+ STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
- conf = file->this->private;
- size = conf->aggregate_size;
- LOCK (&file->lock);
- {
- /*
- * make sure requests are marked for unwinding and adjacent
- * continguous write buffers (each of size less than that of
- * an iobuf) are packed properly so that iobufs are filled to
- * their maximum capacity, before calling __wb_mark_winds.
- */
- __wb_mark_unwinds (&file->request, &unwinds);
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
- __wb_collapse_write_bufs (&file->request,
- file->this->ctx->page_size);
+noqueue:
+ STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+ return 0;
+}
- count = __wb_get_other_requests (&file->request,
- &other_requests);
+int32_t
+wb_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
- if (count == 0) {
- __wb_mark_winds (&file->request, &winds, size,
- conf->enable_trickling_writes);
- }
+ wb_inode = wb_inode_create(this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
- }
- UNLOCK (&file->lock);
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
- ret = wb_do_ops (frame, file, &winds, &unwinds, &other_requests);
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
-out:
- return ret;
+unwind:
+ STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
+int32_t
+wb_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create(this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
int32_t
-wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf)
+wb_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
{
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf);
- return 0;
+ if (op_ret == 0) {
+ wb_inode_t *wb_inode = wb_inode_ctx_get(this, inode);
+ if (wb_inode)
+ wb_set_inode_size(wb_inode, buf);
+ }
+
+ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
}
+int
+wb_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
int32_t
-wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
- int32_t count, off_t offset, struct iobref *iobref)
-{
- wb_file_t *file = NULL;
- char wb_disabled = 0;
- call_frame_t *process_frame = NULL;
- size_t size = 0;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_local_t *local = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1;
- int32_t op_ret = -1, op_errno = EINVAL;
-
- if (vector != NULL)
- size = iov_length (vector, count);
-
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
-
- op_errno = EBADFD;
- goto unwind;
- }
+wb_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- file = (wb_file_t *)(long)tmp_file;
- if ((!IA_ISDIR (fd->inode->ia_type)) && (file == NULL)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "wb_file not found for fd %p", fd);
- op_errno = EBADFD;
- goto unwind;
- }
+ wb_inode = wb_inode_ctx_get(this, loc->inode);
+ if (!wb_inode)
+ goto noqueue;
- if (file != NULL) {
- LOCK (&file->lock);
- {
- op_ret = file->op_ret;
- op_errno = file->op_errno;
-
- file->op_ret = 0;
-
- if ((op_ret == 0)
- && (file->disabled || file->disable_till)) {
- if (size > file->disable_till) {
- file->disable_till = 0;
- } else {
- file->disable_till -= size;
- }
- wb_disabled = 1;
- }
- }
- UNLOCK (&file->lock);
- } else {
- wb_disabled = 1;
- }
+ stub = fop_lookup_stub(frame, wb_lookup_helper, loc, xdata);
+ if (!stub)
+ goto unwind;
- if (op_ret == -1) {
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
- NULL, NULL);
- return 0;
- }
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- if (wb_disabled) {
- STACK_WIND (frame, wb_writev_cbk,
- FIRST_CHILD (frame->this),
- FIRST_CHILD (frame->this)->fops->writev,
- fd, vector, count, offset, iobref);
- return 0;
- }
+ wb_process_queue(wb_inode);
- process_frame = copy_frame (frame);
- if (process_frame == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+ return 0;
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- return 0;
- }
+unwind:
+ if (stub)
+ call_stub_destroy(stub);
- frame->local = local;
- local->file = file;
+ STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+ return 0;
- stub = fop_writev_stub (frame, NULL, fd, vector, count, offset,
- iobref);
- if (stub == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
+noqueue:
+ STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
-
- ret = wb_process_queue (process_frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_errno = ENOMEM;
- goto unwind;
- }
+static void
+wb_mark_readdirp_start(xlator_t *this, inode_t *directory)
+{
+ wb_inode_t *wb_directory_inode = NULL;
- STACK_DESTROY (process_frame->root);
+ wb_directory_inode = wb_inode_create(this, directory);
- return 0;
+ if (!wb_directory_inode)
+ return;
-unwind:
- STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL);
+ LOCK(&wb_directory_inode->lock);
+ {
+ GF_ATOMIC_INC(wb_directory_inode->readdirps);
+ }
+ UNLOCK(&wb_directory_inode->lock);
- if (process_frame) {
- STACK_DESTROY (process_frame->root);
- }
+ return;
+}
+
+static void
+wb_mark_readdirp_end(xlator_t *this, inode_t *directory)
+{
+ wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL;
+ int readdirps = 0;
+
+ wb_directory_inode = wb_inode_ctx_get(this, directory);
+
+ if (!wb_directory_inode)
+ return;
- if (stub) {
- call_stub_destroy (stub);
+ LOCK(&wb_directory_inode->lock);
+ {
+ readdirps = GF_ATOMIC_DEC(wb_directory_inode->readdirps);
+ if (readdirps)
+ goto unlock;
+
+ list_for_each_entry_safe(wb_inode, tmp,
+ &wb_directory_inode->invalidate_list,
+ invalidate_list)
+ {
+ list_del_init(&wb_inode->invalidate_list);
+ GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+ inode_unref(wb_inode->inode);
}
+ }
+unlock:
+ UNLOCK(&wb_directory_inode->lock);
- return 0;
+ return;
}
-
int32_t
-wb_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vector, int32_t count,
- struct iatt *stbuf, struct iobref *iobref)
-{
- wb_local_t *local = NULL;
- wb_file_t *file = NULL;
- wb_request_t *request = NULL;
- int32_t ret = 0;
-
- local = frame->local;
- file = local->file;
- request = local->request;
-
- if ((request != NULL) && (file != NULL)) {
- wb_request_unref (request);
-
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
+wb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ gf_dirent_t *entry = NULL;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+
+ fd = frame->local;
+ frame->local = NULL;
+
+ if (op_ret <= 0)
+ goto unwind;
+
+ list_for_each_entry(entry, &entries->list, list)
+ {
+ if (!entry->inode || !IA_ISREG(entry->d_stat.ia_type))
+ continue;
+
+ wb_inode = wb_inode_ctx_get(this, entry->inode);
+ if (!wb_inode)
+ continue;
+
+ LOCK(&wb_inode->lock);
+ {
+ if (!list_empty(&wb_inode->liability) ||
+ GF_ATOMIC_GET(wb_inode->invalidate)) {
+ inode = entry->inode;
+
+ entry->inode = NULL;
+ memset(&entry->d_stat, 0, sizeof(entry->d_stat));
+ }
}
+ UNLOCK(&wb_inode->lock);
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, stbuf, iobref);
+ if (inode) {
+ inode_unref(inode);
+ inode = NULL;
+ }
+ }
- return 0;
-}
+unwind:
+ wb_mark_readdirp_end(this, fd->inode);
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
-static int32_t
-wb_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
+int32_t
+wb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
{
- STACK_WIND (frame,
- wb_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset);
-
- return 0;
+ wb_mark_readdirp_start(this, fd->inode);
+
+ frame->local = fd;
+
+ STACK_WIND(frame, wb_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+ return 0;
}
+int32_t
+wb_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
int32_t
-wb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset)
-{
- wb_file_t *file = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- int32_t ret = -1;
- wb_request_t *request = NULL;
-
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
-
- STACK_UNWIND_STRICT (readv, frame, -1, EBADFD,
- NULL, 0, NULL, NULL);
- return 0;
- }
+wb_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- file = (wb_file_t *)(long)tmp_file;
+ wb_inode = wb_inode_ctx_get(this, oldloc->inode);
+ if (!wb_inode)
+ goto noqueue;
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM,
- NULL, 0, NULL, NULL);
- return 0;
- }
+ stub = fop_link_stub(frame, wb_link_helper, oldloc, newloc, xdata);
+ if (!stub)
+ goto unwind;
- local->file = file;
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- frame->local = local;
- if (file) {
- stub = fop_readv_stub (frame, wb_readv_helper, fd, size,
- offset);
- if (stub == NULL) {
- STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM,
- NULL, 0, NULL, NULL);
- return 0;
- }
+ wb_process_queue(wb_inode);
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM,
- NULL, 0, NULL, NULL);
- call_stub_destroy (stub);
- return 0;
- }
+ return 0;
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM,
- NULL, 0, NULL, NULL);
- call_stub_destroy (stub);
- return 0;
- }
+unwind:
+ STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
- } else {
- STACK_WIND (frame,
- wb_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset);
- }
+ if (stub)
+ call_stub_destroy(stub);
- return 0;
-}
+ return 0;
+noqueue:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
int32_t
-wb_ffr_bg_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+wb_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t keep_size, off_t offset, size_t len, dict_t *xdata)
{
- STACK_DESTROY (frame->root);
- return 0;
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset,
+ len, xdata);
+ return 0;
}
-
int32_t
-wb_ffr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno)
+wb_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+ off_t offset, size_t len, dict_t *xdata)
{
- wb_local_t *local = NULL;
- wb_file_t *file = NULL;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- local = frame->local;
- file = local->file;
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- if (file != NULL) {
- LOCK (&file->lock);
- {
- if (file->op_ret == -1) {
- op_ret = file->op_ret;
- op_errno = file->op_errno;
+ stub = fop_fallocate_stub(frame, wb_fallocate_helper, fd, keep_size, offset,
+ len, xdata);
+ if (!stub)
+ goto unwind;
- file->op_ret = 0;
- }
- }
- UNLOCK (&file->lock);
- }
-
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno);
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- return 0;
+ wb_process_queue(wb_inode);
+
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+
+ return 0;
+
+noqueue:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset,
+ len, xdata);
+ return 0;
}
+int32_t
+wb_discard_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+ fd, offset, len, xdata);
+ return 0;
+}
int32_t
-wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd)
+wb_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
{
- wb_conf_t *conf = NULL;
- wb_local_t *local = NULL;
- wb_file_t *file = NULL;
- call_frame_t *flush_frame = NULL, *process_frame = NULL;
- int32_t op_ret = -1, op_errno = -1, ret = -1;
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- conf = this->private;
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
- local = frame->local;
- file = local->file;
+ stub = fop_discard_stub(frame, wb_discard_helper, fd, offset, len, xdata);
+ if (!stub)
+ goto unwind;
- LOCK (&file->lock);
- {
- op_ret = file->op_ret;
- op_errno = file->op_errno;
- }
- UNLOCK (&file->lock);
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- if (local && local->request) {
- process_frame = copy_frame (frame);
- if (process_frame == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto unwind;
- }
+ wb_process_queue(wb_inode);
- wb_request_unref (local->request);
- }
-
- if (conf->flush_behind) {
- flush_frame = copy_frame (frame);
- if (flush_frame == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory");
- goto unwind;
- }
+ return 0;
- STACK_WIND (flush_frame,
- wb_ffr_bg_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- } else {
- STACK_WIND (frame,
- wb_ffr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- }
+unwind:
+ STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
- if (process_frame != NULL) {
- ret = wb_process_queue (process_frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- STACK_DESTROY (process_frame->root);
- goto unwind;
- }
+ if (stub)
+ call_stub_destroy(stub);
+ return 0;
- STACK_DESTROY (process_frame->root);
- }
+noqueue:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+ fd, offset, len, xdata);
- if (conf->flush_behind) {
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno);
- }
+ return 0;
+}
- return 0;
+int32_t
+wb_zerofill_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill,
+ fd, offset, len, xdata);
+ return 0;
+}
+
+int32_t
+wb_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
+
+ wb_inode = wb_inode_ctx_get(this, fd->inode);
+ if (!wb_inode)
+ goto noqueue;
+
+ stub = fop_zerofill_stub(frame, wb_zerofill_helper, fd, offset, len, xdata);
+ if (!stub)
+ goto unwind;
+
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
+
+ wb_process_queue(wb_inode);
+
+ return 0;
unwind:
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- return 0;
-}
+ STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ if (stub)
+ call_stub_destroy(stub);
+noqueue:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill,
+ fd, offset, len, xdata);
+ return 0;
+}
int32_t
-wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
-{
- wb_conf_t *conf = NULL;
- wb_file_t *file = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- call_frame_t *flush_frame = NULL;
- wb_request_t *request = NULL;
- int32_t ret = 0;
-
- conf = this->private;
-
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
-
- STACK_UNWIND_STRICT (flush, frame, -1, EBADFD);
- return 0;
- }
+wb_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+ call_stub_t *stub = NULL;
- file = (wb_file_t *)(long)tmp_file;
+ wb_inode = wb_inode_ctx_get(this, oldloc->inode);
+ if (!wb_inode)
+ goto noqueue;
- if (file != NULL) {
- local = GF_CALLOC (1, sizeof (*local), gf_wb_mt_wb_local_t);
- if (local == NULL) {
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- return 0;
- }
+ stub = fop_rename_stub(frame, default_rename_resume, oldloc, newloc, xdata);
+ if (!stub)
+ goto unwind;
- local->file = file;
+ if (!wb_enqueue(wb_inode, stub))
+ goto unwind;
- frame->local = local;
+ wb_process_queue(wb_inode);
- stub = fop_flush_stub (frame, wb_flush_helper, fd);
- if (stub == NULL) {
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- return 0;
- }
+ return 0;
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- call_stub_destroy (stub);
- return 0;
- }
+unwind:
+ if (stub)
+ call_stub_destroy(stub);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- call_stub_destroy (stub);
- return 0;
- }
- } else {
- if (conf->flush_behind) {
- flush_frame = copy_frame (frame);
- if (flush_frame == NULL) {
- STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM);
- return 0;
- }
-
- STACK_UNWIND_STRICT (flush, frame, 0, 0);
-
- STACK_WIND (flush_frame,
- wb_ffr_bg_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- } else {
- STACK_WIND (frame,
- wb_ffr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd);
- }
- }
+ STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
-}
+ return 0;
+noqueue:
+ STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
-static int32_t
-wb_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf)
+int
+wb_forget(xlator_t *this, inode_t *inode)
{
- wb_local_t *local = NULL;
- wb_file_t *file = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1;
+ uint64_t tmp = 0;
+ wb_inode_t *wb_inode = NULL;
- local = frame->local;
- file = local->file;
- request = local->request;
+ inode_ctx_del(inode, this, &tmp);
- if (file != NULL) {
- LOCK (&file->lock);
- {
- if (file->op_ret == -1) {
- op_ret = file->op_ret;
- op_errno = file->op_errno;
+ wb_inode = (wb_inode_t *)(long)tmp;
- file->op_ret = 0;
- }
- }
- UNLOCK (&file->lock);
-
- if (request) {
- wb_request_unref (request);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- op_ret = -1;
- op_errno = ENOMEM;
- }
- }
+ if (!wb_inode)
+ return 0;
- }
+ wb_inode_destroy(wb_inode);
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf);
-
- return 0;
+ return 0;
}
+int
+wb_release(xlator_t *this, fd_t *fd)
+{
+ uint64_t tmp = 0;
+
+ (void)fd_ctx_del(fd, this, &tmp);
+
+ return 0;
+}
-static int32_t
-wb_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
+int
+wb_priv_dump(xlator_t *this)
{
- STACK_WIND (frame,
- wb_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd, datasync);
- return 0;
+ wb_conf_t *conf = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+ 0,
+ };
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("write-behind", this, out);
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+ gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind",
+ "priv");
+
+ gf_proc_dump_add_section("%s", key_prefix);
+
+ gf_proc_dump_write("aggregate_size", "%" PRIu64, conf->aggregate_size);
+ gf_proc_dump_write("window_size", "%" PRIu64, conf->window_size);
+ gf_proc_dump_write("flush_behind", "%d", conf->flush_behind);
+ gf_proc_dump_write("trickling_writes", "%d", conf->trickling_writes);
+
+ ret = 0;
+out:
+ return ret;
}
+void
+__wb_dump_requests(struct list_head *head, char *prefix)
+{
+ char key[GF_DUMP_MAX_BUF_LEN] = {
+ 0,
+ };
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] =
+ {
+ 0,
+ },
+ flag = 0;
+ wb_request_t *req = NULL;
-int32_t
-wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync)
-{
- wb_file_t *file = NULL;
- wb_local_t *local = NULL;
- uint64_t tmp_file = 0;
- call_stub_t *stub = NULL;
- wb_request_t *request = NULL;
- int32_t ret = -1;
-
- if ((!IA_ISDIR (fd->inode->ia_type))
- && fd_ctx_get (fd, this, &tmp_file)) {
- gf_log (this->name, GF_LOG_DEBUG, "write behind file pointer is"
- " not stored in context of fd(%p), returning EBADFD",
- fd);
-
- STACK_UNWIND_STRICT (fsync, frame, -1, EBADFD, NULL, NULL);
- return 0;
- }
+ list_for_each_entry(req, head, all)
+ {
+ gf_proc_dump_build_key(key_prefix, key, "%s",
+ (char *)gf_fop_list[req->fop]);
- file = (wb_file_t *)(long)tmp_file;
+ gf_proc_dump_add_section("%s", key_prefix);
- local = GF_CALLOC (1, sizeof (*local),
- gf_wb_mt_wb_local_t);
- if (local == NULL) {
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, NULL, NULL);
- return 0;
- }
+ gf_proc_dump_write("unique", "%" PRIu64, req->unique);
- local->file = file;
+ gf_proc_dump_write("refcount", "%d", req->refcount);
- frame->local = local;
+ if (list_empty(&req->todo))
+ gf_proc_dump_write("wound", "yes");
+ else
+ gf_proc_dump_write("wound", "no");
- if (file) {
- stub = fop_fsync_stub (frame, wb_fsync_helper, fd, datasync);
- if (stub == NULL) {
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM,
- NULL, NULL);
- return 0;
- }
-
- request = wb_enqueue (file, stub);
- if (request == NULL) {
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM,
- NULL, NULL);
- call_stub_destroy (stub);
- return 0;
- }
+ gf_proc_dump_write("generation-number", "%" PRIu64, req->gen);
- ret = wb_process_queue (frame, file);
- if ((ret == -1) && (errno == ENOMEM)) {
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM,
- NULL, NULL);
- call_stub_destroy (stub);
- return 0;
- }
-
- } else {
- STACK_WIND (frame,
- wb_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd, datasync);
- }
+ gf_proc_dump_write("req->op_ret", "%d", req->op_ret);
+ gf_proc_dump_write("req->op_errno", "%d", req->op_errno);
+ gf_proc_dump_write("sync-attempts", "%d", req->wind_count);
- return 0;
-}
+ if (req->fop == GF_FOP_WRITE) {
+ if (list_empty(&req->wip))
+ gf_proc_dump_write("sync-in-progress", "no");
+ else
+ gf_proc_dump_write("sync-in-progress", "yes");
+ gf_proc_dump_write("size", "%" GF_PRI_SIZET, req->write_size);
-int32_t
-wb_release (xlator_t *this, fd_t *fd)
-{
- uint64_t file_ptr = 0;
- wb_file_t *file = NULL;
+ if (req->stub)
+ gf_proc_dump_write("offset", "%" PRId64,
+ req->stub->args.offset);
- fd_ctx_get (fd, this, &file_ptr);
- file = (wb_file_t *) (long) file_ptr;
+ flag = req->ordering.lied;
+ gf_proc_dump_write("lied", "%d", flag);
- if (file != NULL) {
- LOCK (&file->lock);
- {
- assert (list_empty (&file->request));
- }
- UNLOCK (&file->lock);
+ flag = req->ordering.append;
+ gf_proc_dump_write("append", "%d", flag);
- wb_file_destroy (file);
- }
+ flag = req->ordering.fulfilled;
+ gf_proc_dump_write("fulfilled", "%d", flag);
- return 0;
+ flag = req->ordering.go;
+ gf_proc_dump_write("go", "%d", flag);
+ }
+ }
}
int
-wb_priv_dump (xlator_t *this)
+wb_inode_dump(xlator_t *this, inode_t *inode)
{
- wb_conf_t *conf = NULL;
- char key[GF_DUMP_MAX_BUF_LEN];
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ wb_inode_t *wb_inode = NULL;
+ int32_t ret = -1;
+ char *path = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+ 0,
+ };
+ char uuid_str[64] = {
+ 0,
+ };
+
+ if ((inode == NULL) || (this == NULL)) {
+ ret = 0;
+ goto out;
+ }
- if (!this)
- return -1;
+ wb_inode = wb_inode_ctx_get(this, inode);
+ if (wb_inode == NULL) {
+ ret = 0;
+ goto out;
+ }
- conf = this->private;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING,
- "conf null in xlator");
- return -1;
- }
+ uuid_utoa_r(inode->gfid, uuid_str);
- gf_proc_dump_build_key (key_prefix,
- "xlator.performance.write-behind",
- "priv");
-
- gf_proc_dump_add_section (key_prefix);
-
- gf_proc_dump_build_key (key, key_prefix, "aggregate_size");
- gf_proc_dump_write (key, "%d", conf->aggregate_size);
- gf_proc_dump_build_key (key, key_prefix, "window_size");
- gf_proc_dump_write (key, "%d", conf->window_size);
- gf_proc_dump_build_key (key, key_prefix, "disable_till");
- gf_proc_dump_write (key, "%d", conf->disable_till);
- gf_proc_dump_build_key (key, key_prefix, "enable_O_SYNC");
- gf_proc_dump_write (key, "%d", conf->enable_O_SYNC);
- gf_proc_dump_build_key (key, key_prefix, "flush_behind");
- gf_proc_dump_write (key, "%d", conf->flush_behind);
- gf_proc_dump_build_key (key, key_prefix, "enable_trickling_writes");
- gf_proc_dump_write (key, "%d", conf->enable_trickling_writes);
+ gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind",
+ "wb_inode");
- return 0;
-}
+ gf_proc_dump_add_section("%s", key_prefix);
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
+ __inode_path(inode, NULL, &path);
+ if (path != NULL) {
+ gf_proc_dump_write("path", "%s", path);
+ GF_FREE(path);
+ }
- if (!this)
- return ret;
+ gf_proc_dump_write("inode", "%p", inode);
- ret = xlator_mem_acct_init (this, gf_wb_mt_end + 1);
+ gf_proc_dump_write("gfid", "%s", uuid_str);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
+ gf_proc_dump_write("window_conf", "%" GF_PRI_SIZET, wb_inode->window_conf);
+
+ gf_proc_dump_write("window_current", "%" GF_PRI_SIZET,
+ wb_inode->window_current);
+
+ gf_proc_dump_write("transit-size", "%" GF_PRI_SIZET, wb_inode->transit);
+
+ gf_proc_dump_write("dontsync", "%d", wb_inode->dontsync);
+
+ ret = TRY_LOCK(&wb_inode->lock);
+ if (!ret) {
+ if (!list_empty(&wb_inode->all)) {
+ __wb_dump_requests(&wb_inode->all, key_prefix);
}
+ UNLOCK(&wb_inode->lock);
+ }
- return ret;
+ if (ret && wb_inode)
+ gf_proc_dump_write("Unable to dump the inode information",
+ "(Lock acquisition failed) %p (gfid: %s)", wb_inode,
+ uuid_str);
+
+ ret = 0;
+out:
+ return ret;
}
-int32_t
-init (xlator_t *this)
+int
+mem_acct_init(xlator_t *this)
{
- dict_t *options = NULL;
- wb_conf_t *conf = NULL;
- char *str = NULL;
- int32_t ret = -1;
+ int ret = -1;
- if ((this->children == NULL)
- || this->children->next) {
- gf_log (this->name, GF_LOG_ERROR,
- "FATAL: write-behind (%s) not configured with exactly "
- "one child",
- this->name);
- return -1;
- }
+ if (!this) {
+ goto out;
+ }
- if (this->parents == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- options = this->options;
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_wb_mt_wb_conf_t);
- if (conf == NULL) {
- gf_log (this->name, GF_LOG_ERROR,
- "FATAL: Out of memory");
- return -1;
- }
-
- conf->enable_O_SYNC = _gf_false;
- ret = dict_get_str (options, "enable-O_SYNC",
- &str);
- if (ret == 0) {
- ret = gf_string2boolean (str,
- &conf->enable_O_SYNC);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "'enable-O_SYNC' takes only boolean arguments");
- return -1;
- }
- }
+ ret = xlator_mem_acct_init(this, gf_wb_mt_end + 1);
- /* configure 'options aggregate-size <size>' */
- conf->aggregate_size = WB_AGGREGATE_SIZE;
- conf->disable_till = 0;
- ret = dict_get_str (options, "disable-for-first-nbytes",
- &str);
- if (ret == 0) {
- ret = gf_string2bytesize (str,
- &conf->disable_till);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "invalid number format \"%s\" of \"option "
- "disable-for-first-nbytes\"",
- str);
- return -1;
- }
- }
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, WRITE_BEHIND_MSG_NO_MEMORY,
+ "Memory accounting init"
+ "failed");
+ }
- gf_log (this->name, GF_LOG_DEBUG,
- "disabling write-behind for first %"PRIu64" bytes",
- conf->disable_till);
-
- /* configure 'option window-size <size>' */
- conf->window_size = WB_WINDOW_SIZE;
- ret = dict_get_str (options, "cache-size",
- &str);
- if (ret == 0) {
- ret = gf_string2bytesize (str,
- &conf->window_size);
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "invalid number format \"%s\" of \"option "
- "window-size\"",
- str);
- GF_FREE (conf);
- return -1;
- }
- }
+out:
+ return ret;
+}
- if (!conf->window_size && conf->aggregate_size) {
- gf_log (this->name, GF_LOG_WARNING,
- "setting window-size to be equal to "
- "aggregate-size(%"PRIu64")",
- conf->aggregate_size);
- conf->window_size = conf->aggregate_size;
- }
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ wb_conf_t *conf = NULL;
+ int ret = -1;
- if (conf->window_size < conf->aggregate_size) {
- gf_log (this->name, GF_LOG_ERROR,
- "aggregate-size(%"PRIu64") cannot be more than "
- "window-size"
- "(%"PRIu64")", conf->aggregate_size, conf->window_size);
- GF_FREE (conf);
- return -1;
- }
+ conf = this->private;
- /* configure 'option flush-behind <on/off>' */
- conf->flush_behind = 1;
- ret = dict_get_str (options, "flush-behind",
- &str);
- if (ret == 0) {
- ret = gf_string2boolean (str,
- &conf->flush_behind);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "'flush-behind' takes only boolean arguments");
- return -1;
- }
+ GF_OPTION_RECONF("cache-size", conf->window_size, options, size_uint64,
+ out);
- if (conf->flush_behind) {
- gf_log (this->name, GF_LOG_DEBUG,
- "enabling flush-behind");
- }
- }
+ GF_OPTION_RECONF("flush-behind", conf->flush_behind, options, bool, out);
- conf->enable_trickling_writes = _gf_true;
- ret = dict_get_str (options, "enable-trickling-writes",
- &str);
- if (ret == 0) {
- ret = gf_string2boolean (str,
- &conf->enable_trickling_writes);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "'enable-trickling_writes' takes only boolean"
- " arguments");
- return -1;
- }
- }
+ GF_OPTION_RECONF("trickling-writes", conf->trickling_writes, options, bool,
+ out);
- this->private = conf;
- return 0;
+ GF_OPTION_RECONF("strict-O_DIRECT", conf->strict_O_DIRECT, options, bool,
+ out);
+
+ GF_OPTION_RECONF("strict-write-ordering", conf->strict_write_ordering,
+ options, bool, out);
+ GF_OPTION_RECONF("resync-failed-syncs-after-fsync",
+ conf->resync_after_fsync, options, bool, out);
+
+ ret = 0;
+out:
+ return ret;
}
+int32_t
+init(xlator_t *this)
+{
+ wb_conf_t *conf = NULL;
+ int32_t ret = -1;
+
+ if ((this->children == NULL) || this->children->next) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_INIT_FAILED,
+ "FATAL: write-behind (%s) not configured with exactly "
+ "one child",
+ this->name);
+ goto out;
+ }
+
+ if (this->parents == NULL) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ WRITE_BEHIND_MSG_VOL_MISCONFIGURED,
+ "dangling volume. check volfilex");
+ }
+
+ conf = GF_CALLOC(1, sizeof(*conf), gf_wb_mt_wb_conf_t);
+ if (conf == NULL) {
+ goto out;
+ }
+
+ /* configure 'options aggregate-size <size>' */
+ GF_OPTION_INIT("aggregate-size", conf->aggregate_size, size_uint64, out);
+ conf->page_size = conf->aggregate_size;
+
+ /* configure 'option window-size <size>' */
+ GF_OPTION_INIT("cache-size", conf->window_size, size_uint64, out);
+
+ if (!conf->window_size && conf->aggregate_size) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_SIZE_NOT_SET,
+ "setting window-size to be equal to "
+ "aggregate-size(%" PRIu64 ")",
+ conf->aggregate_size);
+ conf->window_size = conf->aggregate_size;
+ }
+
+ if (conf->window_size < conf->aggregate_size) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE,
+ "aggregate-size(%" PRIu64
+ ") cannot be more than "
+ "window-size(%" PRIu64 ")",
+ conf->aggregate_size, conf->window_size);
+ goto out;
+ }
+
+ /* configure 'option flush-behind <on/off>' */
+ GF_OPTION_INIT("flush-behind", conf->flush_behind, bool, out);
+
+ GF_OPTION_INIT("trickling-writes", conf->trickling_writes, bool, out);
+
+ GF_OPTION_INIT("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out);
+
+ GF_OPTION_INIT("strict-write-ordering", conf->strict_write_ordering, bool,
+ out);
+
+ GF_OPTION_INIT("resync-failed-syncs-after-fsync", conf->resync_after_fsync,
+ bool, out);
+
+ this->private = conf;
+ ret = 0;
+
+out:
+ if (ret) {
+ GF_FREE(conf);
+ }
+ return ret;
+}
void
-fini (xlator_t *this)
+fini(xlator_t *this)
{
- wb_conf_t *conf = this->private;
+ wb_conf_t *conf = NULL;
- GF_FREE (conf);
- return;
-}
+ GF_VALIDATE_OR_GOTO("write-behind", this, out);
+ conf = this->private;
+ if (!conf) {
+ goto out;
+ }
-struct xlator_fops fops = {
- .writev = wb_writev,
- .open = wb_open,
- .create = wb_create,
- .readv = wb_readv,
- .flush = wb_flush,
- .fsync = wb_fsync,
- .stat = wb_stat,
- .fstat = wb_fstat,
- .truncate = wb_truncate,
- .ftruncate = wb_ftruncate,
- .setattr = wb_setattr,
-};
+ this->private = NULL;
+ GF_FREE(conf);
+out:
+ return;
+}
-struct xlator_cbks cbks = {
- .release = wb_release
+struct xlator_fops fops = {
+ .writev = wb_writev,
+ .readv = wb_readv,
+ .flush = wb_flush,
+ .fsync = wb_fsync,
+ .stat = wb_stat,
+ .fstat = wb_fstat,
+ .truncate = wb_truncate,
+ .ftruncate = wb_ftruncate,
+ .setattr = wb_setattr,
+ .fsetattr = wb_fsetattr,
+ .lookup = wb_lookup,
+ .readdirp = wb_readdirp,
+ .link = wb_link,
+ .fallocate = wb_fallocate,
+ .discard = wb_discard,
+ .zerofill = wb_zerofill,
+ .rename = wb_rename,
};
+struct xlator_cbks cbks = {.forget = wb_forget, .release = wb_release};
+
struct xlator_dumpops dumpops = {
- .priv = wb_priv_dump,
+ .priv = wb_priv_dump,
+ .inodectx = wb_inode_dump,
};
struct volume_options options[] = {
- { .key = {"flush-behind"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"cache-size", "window-size"},
- .type = GF_OPTION_TYPE_SIZET,
- .min = 512 * GF_UNIT_KB,
- .max = 1 * GF_UNIT_GB
- },
- { .key = {"disable-for-first-nbytes"},
- .type = GF_OPTION_TYPE_SIZET,
- .min = 1,
- .max = 1 * GF_UNIT_MB,
- },
- { .key = {"enable-O_SYNC"},
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = {"enable-trickling-writes"},
- .type = GF_OPTION_TYPE_BOOL,
- },
- { .key = {NULL} },
+ {
+ .key = {"write-behind"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "enable/disable write-behind",
+ .op_version = {GD_OP_VERSION_6_0},
+ .flags = OPT_FLAG_SETTABLE,
+ },
+ {.key = {"flush-behind"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .description = "If this option is set ON, instructs write-behind "
+ "translator to perform flush in background, by "
+ "returning success (or any errors, if any of "
+ "previous writes were failed) to application even "
+ "before flush FOP is sent to backend filesystem. "},
+ {.key = {"cache-size", "window-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 512 * GF_UNIT_KB,
+ .max = 1 * GF_UNIT_GB,
+ .default_value = "1MB",
+ .op_version = {1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .description = "Size of the write-behind buffer for a single file "
+ "(inode)."},
+ {
+ .key = {"trickling-writes"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {GD_OP_VERSION_3_13_1},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .default_value = "on",
+ },
+ {.key = {"strict-O_DIRECT"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {2},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .description = "This option when set to off, ignores the "
+ "O_DIRECT flag."},
+ {
+ .key = {"strict-write-ordering"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {2},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .description = "Do not let later writes overtake earlier writes even "
+ "if they do not overlap",
+ },
+ {
+ .key = {"resync-failed-syncs-after-fsync"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {GD_OP_VERSION_3_7_7},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .tags = {"write-behind"},
+ .description = "If sync of \"cached-writes issued before fsync\" "
+ "(to backend) fails, this option configures whether "
+ "to retry syncing them after fsync or forget them. "
+ "If set to on, cached-writes are retried "
+ "till a \"flush\" fop (or a successful sync) on sync "
+ "failures. "
+ "fsync itself is failed irrespective of the value of "
+ "this option. ",
+ },
+ {
+ .key = {"aggregate-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .default_value = "128KB",
+ .op_version = {GD_OP_VERSION_4_1_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .description = "Will aggregate writes until data of specified "
+ "size is fully filled for a single file provided "
+ "there are no dependent fops on cached writes. This "
+ "option just sets the aggregate size. Note that "
+ "aggregation won't happen if "
+ "performance.write-behind-trickling-writes"
+ " is turned on. Hence turn off "
+ "performance.write-behind.trickling-writes"
+ " so that writes are aggregated till a max of "
+ "\"aggregate-size\" bytes",
+ },
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "write-behind",
+ .category = GF_MAINTAINED,
};