diff options
-rw-r--r-- | libglusterfs/src/common-utils.h | 6 | ||||
-rwxr-xr-x | tests/bugs/rdma/bug-765473.t | 35 | ||||
-rw-r--r-- | tests/bugs/write-behind/bug-1279730.c | 131 | ||||
-rwxr-xr-x | tests/bugs/write-behind/bug-1279730.t | 31 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 14 | ||||
-rw-r--r-- | xlators/mount/fuse/src/fuse-bridge.c | 7 | ||||
-rw-r--r-- | xlators/performance/write-behind/src/write-behind.c | 480 |
7 files changed, 562 insertions, 142 deletions
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index 0e7aa016cbd..f5f4493e21b 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -432,10 +432,12 @@ iov_subset (struct iovec *orig, int orig_count, int i; off_t offset = 0; size_t start_offset = 0; - size_t end_offset = 0; + size_t end_offset = 0, origin_iov_len = 0; for (i = 0; i < orig_count; i++) { + origin_iov_len = orig[i].iov_len; + if ((offset + orig[i].iov_len < src_offset) || (offset > dst_offset)) { goto not_subset; @@ -463,7 +465,7 @@ iov_subset (struct iovec *orig, int orig_count, new_count++; not_subset: - offset += orig[i].iov_len; + offset += origin_iov_len; } return new_count; diff --git a/tests/bugs/rdma/bug-765473.t b/tests/bugs/rdma/bug-765473.t deleted file mode 100755 index 9f595a1d479..00000000000 --- a/tests/bugs/rdma/bug-765473.t +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc -. $(dirname $0)/../../fileio.rc - -cleanup; - -function clients_connected() -{ - volname=$1 - gluster volume status $volname clients | grep -i 'Clients connected' | sed -e 's/[^0-9]*\(.*\)/\1/g' -} - -## Start and create a volume -TEST glusterd; -TEST pidof glusterd; -TEST $CLI volume create $V0 $H0:$B0/${V0}1 -TEST $CLI volume start $V0; - -TEST glusterfs --direct-io-mode=yes --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0; - -TEST fd=`fd_available` -TEST fd_open $fd 'w' "$M0/testfile" -TEST fd_write $fd "content" -TEST $CLI volume stop $V0 -# write some content which will result in marking fd bad -fd_write $fd "more content" -sync $V0 -TEST $CLI volume start $V0 -EXPECT 'Started' volinfo_field $V0 'Status'; -EXPECT_WITHIN $PROCESS_UP_TIMEOUT 2 clients_connected $V0 -TEST ! fd_write $fd "still more content" - -cleanup diff --git a/tests/bugs/write-behind/bug-1279730.c b/tests/bugs/write-behind/bug-1279730.c new file mode 100644 index 00000000000..535d289c582 --- /dev/null +++ b/tests/bugs/write-behind/bug-1279730.c @@ -0,0 +1,131 @@ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> + +int +main (int argc, char *argv[]) +{ + int fd = -1, ret = -1, len = 0; + char *path = NULL, buf[128] = {0, }, *cmd = NULL; + struct stat stbuf = {0, }; + int write_to_child[2] = {0, }, write_to_parent[2] = {0, }; + + path = argv[1]; + cmd = argv[2]; + + assert (argc == 3); + + ret = pipe (write_to_child); + if (ret < 0) { + fprintf (stderr, "creation of write-to-child pipe failed " + "(%s)\n", strerror (errno)); + goto out; + } + + ret = pipe (write_to_parent); + if (ret < 0) { + fprintf (stderr, "creation of write-to-parent pipe failed " + "(%s)\n", strerror (errno)); + goto out; + } + + ret = fork (); + switch (ret) { + case 0: + close (write_to_child[1]); + close (write_to_parent[0]); + + /* child, wait for instructions to execute command */ + ret = read (write_to_child[0], buf, 128); + if (ret < 0) { + fprintf (stderr, "child: read on pipe failed (%s)\n", + strerror (errno)); + goto out; + } + + system (cmd); + + ret = write (write_to_parent[1], "1", 2); + if (ret < 0) { + fprintf (stderr, "child: write to pipe failed (%s)\n", + strerror (errno)); + goto out; + } + break; + + case -1: + fprintf (stderr, "fork failed (%s)\n", strerror (errno)); + goto out; + + default: + close (write_to_parent[1]); + close (write_to_child[0]); + + fd = open (path, O_CREAT | O_RDWR | O_APPEND, S_IRWXU); + if (fd < 0) { + fprintf (stderr, "open failed (%s)\n", + strerror (errno)); + goto out; + } + + len = strlen ("test-content") + 1; + ret = write (fd, "test-content", len); + + if (ret < len) { + fprintf (stderr, "write failed %d (%s)\n", ret, + strerror (errno)); + } + + ret = pread (fd, buf, 128, 0); + if ((ret == len) && (strcmp (buf, "test-content") == 0)) { + fprintf (stderr, "read should've failed as previous " + "write would've failed with EDQUOT, but its " + "successful"); + ret = -1; + goto out; + } + + ret = write (write_to_child[1], "1", 2); + if (ret < 0) { + fprintf (stderr, "parent: write to pipe failed (%s)\n", + strerror (errno)); + goto out; + } + + ret = read (write_to_parent[0], buf, 128); + if (ret < 0) { + fprintf (stderr, "parent: read from pipe failed (%s)\n", + strerror (errno)); + goto out; + } + + /* this will force a sync on cached-write and now that quota + limit is increased, sync will be successful. ignore return + value as fstat would fail with EDQUOT (picked up from + cached-write because of previous sync failure. + */ + fstat (fd, &stbuf); + + ret = pread (fd, buf, 128, 0); + if (ret != len) { + fprintf (stderr, "post cmd read failed %d (data:%s) " + "(error:%s)\n", ret, buf, strerror (errno)); + goto out; + } + + if (strcmp (buf, "test-content")) { + fprintf (stderr, "wrong data (%s)\n", buf); + goto out; + } + } + + ret = 0; + +out: + return ret; +} diff --git a/tests/bugs/write-behind/bug-1279730.t b/tests/bugs/write-behind/bug-1279730.t new file mode 100755 index 00000000000..38e564b7afc --- /dev/null +++ b/tests/bugs/write-behind/bug-1279730.t @@ -0,0 +1,31 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../fileio.rc + +cleanup; + +## Start and create a volume +TEST glusterd; +TEST pidof glusterd; +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/$V0; +TEST $CLI volume start $V0; +TEST $CLI volume quota $V0 enable +TEST $CLI volume quota $V0 limit-usage / 4 + +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# compile the test program and run it +TEST $CC -O0 -g3 $(dirname $0)/bug-1279730.c -o $(dirname $0)/bug-1279730 + +TEST $(dirname $0)/bug-1279730 $M0/file "\"$CLI volume quota $V0 limit-usage / 1024\"" + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +TEST $CLI volume stop $V0; +TEST $CLI volume delete $V0; + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index b5e5d5fbf7b..593ae2aa833 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1215,6 +1215,20 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "performance.resync-failed-syncs-after-fsync", + .voltype = "performance/write-behind", + .option = "resync-failed-syncs-after-fsync", + .op_version = GD_OP_VERSION_3_7_6, + .flags = OPT_FLAG_CLIENT_OPT, + .description = "If sync of \"cached-writes issued before fsync\" " + "(to backend) fails, this option configures whether " + "to retry syncing them after fsync or forget them. " + "If set to on, cached-writes are retried " + "till a \"flush\" fop (or a successful sync) on sync " + "failures. " + "fsync itself is failed irrespective of the value of " + "this option. ", + }, { .key = "performance.nfs.write-behind-window-size", .voltype = "performance/write-behind", .option = "cache-size", diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index fcc77c51be9..eba8a6ddb0c 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -4968,11 +4968,16 @@ fuse_thread_proc (void *data) int32_t fuse_itable_dump (xlator_t *this) { + fuse_private_t *priv = NULL; + if (!this) return -1; + priv = this->private; + gf_proc_dump_add_section("xlator.mount.fuse.itable"); - inode_table_dump(this->itable, "xlator.mount.fuse.itable"); + inode_table_dump(priv->active_subvol->itable, + "xlator.mount.fuse.itable"); return 0; } diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 1adda4eaff4..285420526f4 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -107,6 +107,14 @@ typedef struct wb_inode { size_t size; /* Size of the file to catch write after EOF. */ gf_lock_t lock; xlator_t *this; + int dontsync; /* If positive, dont pick lies for + * winding. This is needed to break infinite + * recursion during invocation of + * wb_process_queue from + * wb_fulfill_cbk in case of an + * error during fulfill. + */ + } wb_inode_t; @@ -144,6 +152,8 @@ typedef struct wb_request { request arrival */ fd_t *fd; + int wind_count; /* number of sync-attempts. Only + for debug purposes */ struct { size_t size; /* 0 size == till infinity */ off_t off; @@ -164,6 +174,7 @@ typedef struct wb_conf { gf_boolean_t trickling_writes; gf_boolean_t strict_write_ordering; gf_boolean_t strict_O_DIRECT; + gf_boolean_t resync_after_fsync; } wb_conf_t; @@ -202,26 +213,6 @@ out: } -gf_boolean_t -wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) -{ - gf_boolean_t err = _gf_false; - uint64_t value = 0; - int32_t tmp = 0; - - if (fd_ctx_get (fd, this, &value) == 0) { - if (op_errno) { - tmp = value; - *op_errno = tmp; - } - - err = _gf_true; - } - - return err; -} - - /* Below is a succinct explanation of the code deciding whether two regions overlap, from Pavan <tcp@gluster.com>. @@ -305,17 +296,17 @@ wb_requests_conflict (wb_request_t *lie, wb_request_t *req) } -gf_boolean_t +wb_request_t * wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req) { wb_request_t *each = NULL; list_for_each_entry (each, &wb_inode->liability, lie) { if (wb_requests_conflict (each, req)) - return _gf_true; + return each; } - return _gf_false; + return NULL; } @@ -552,6 +543,9 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted) break; default: + if (stub && stub->args.fd) + req->fd = fd_ref (stub->args.fd); + break; } @@ -679,6 +673,88 @@ __wb_fulfill_request (wb_request_t *req) } +/* get a flush/fsync waiting on req */ +wb_request_t * +__wb_request_waiting_on (wb_request_t *req) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *trav = NULL; + + wb_inode = req->wb_inode; + + list_for_each_entry (trav, &wb_inode->todo, todo) { + if ((trav->fd == req->fd) + && ((trav->stub->fop == GF_FOP_FLUSH) + || (trav->stub->fop == GF_FOP_FSYNC)) + && (trav->gen >= req->gen)) + return trav; + } + + return NULL; +} + + +void +__wb_fulfill_request_err (wb_request_t *req, int32_t op_errno) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *waiter = NULL; + wb_conf_t *conf = NULL; + + wb_inode = req->wb_inode; + + conf = wb_inode->this->private; + + req->op_ret = -1; + req->op_errno = op_errno; + + if (req->ordering.lied) + waiter = __wb_request_waiting_on (req); + + if (!req->ordering.lied || waiter) { + if (!req->ordering.lied) { + /* response to app is still pending, send failure in + * response. + */ + } else { + /* response was sent, store the error in a + * waiter (either an fsync or flush). + */ + waiter->op_ret = -1; + waiter->op_errno = op_errno; + } + + if (!req->ordering.lied + || (waiter->stub->fop == GF_FOP_FLUSH) + || ((waiter->stub->fop == GF_FOP_FSYNC) + && !conf->resync_after_fsync)) { + /* No retry needed, forget the request */ + __wb_fulfill_request (req); + return; + } + } + + /* response was unwound and no waiter waiting on this request, retry + till a flush or fsync (subject to conf->resync_after_fsync). + */ + wb_inode->transit -= req->total_size; + + req->total_size = 0; + + list_del_init (&req->winds); + list_del_init (&req->todo); + list_del_init (&req->wip); + + /* sanitize ordering flags to retry */ + req->ordering.go = 0; + + /* Add back to todo list to retry */ + list_add (&req->todo, &wb_inode->todo); + + return; +} + + void wb_head_done (wb_request_t *head) { @@ -693,6 +769,7 @@ wb_head_done (wb_request_t *head) list_for_each_entry_safe (req, tmp, &head->winds, winds) { __wb_fulfill_request (req); } + __wb_fulfill_request (head); } UNLOCK (&wb_inode->lock); @@ -700,29 +777,130 @@ wb_head_done (wb_request_t *head) void +__wb_fulfill_err (wb_request_t *head, int op_errno) +{ + wb_request_t *req = NULL, *tmp = NULL; + + if (!head) + goto out; + + head->wb_inode->dontsync++; + + list_for_each_entry_safe_reverse (req, tmp, &head->winds, + winds) { + __wb_fulfill_request_err (req, op_errno); + } + + __wb_fulfill_request_err (head, op_errno); + +out: + return; +} + + +void wb_fulfill_err (wb_request_t *head, int op_errno) { - wb_inode_t *wb_inode; - wb_request_t *req; + wb_inode_t *wb_inode = NULL; wb_inode = head->wb_inode; - /* for all future requests yet to arrive */ - fd_ctx_set (head->fd, THIS, op_errno); - LOCK (&wb_inode->lock); { - /* for all requests already arrived */ - list_for_each_entry (req, &wb_inode->all, all) { - if (req->fd != head->fd) - continue; - req->op_ret = -1; - req->op_errno = op_errno; - } + __wb_fulfill_err (head, op_errno); + } UNLOCK (&wb_inode->lock); } +inline void +__wb_modify_write_request (wb_request_t *req, int synced_size, + int head_total_size) +{ + struct iovec *vector = NULL; + int count = 0; + + if (!req || synced_size == 0) + goto out; + + req->write_size -= synced_size; + req->stub->args.offset += synced_size; + req->total_size = head_total_size; + + vector = req->stub->args.vector; + count = req->stub->args.count; + + req->stub->args.count = iov_subset (vector, count, synced_size, + iov_length (vector, count), vector); + +out: + return; +} + +int +__wb_fulfill_short_write (wb_request_t *req, int size, int total_size) +{ + int accounted_size = 0; + + if (req == NULL) + goto out; + + if (req->write_size <= size) { + accounted_size = req->write_size; + __wb_fulfill_request (req); + } else { + accounted_size = size; + __wb_modify_write_request (req, size, total_size); + } + +out: + return accounted_size; +} + +void +wb_fulfill_short_write (wb_request_t *head, int size) +{ + wb_inode_t *wb_inode = NULL; + wb_request_t *req = NULL, *tmp = NULL; + int total_size = 0, accounted_size = 0; + + if (!head) + goto out; + + wb_inode = head->wb_inode; + + total_size = head->total_size - size; + head->total_size = size; + + req = head; + + LOCK (&wb_inode->lock); + { + accounted_size = __wb_fulfill_short_write (head, size, + total_size); + + size -= accounted_size; + + if (size == 0) + goto done; + + list_for_each_entry_safe (req, tmp, &head->winds, winds) { + accounted_size = __wb_fulfill_short_write (req, size, + total_size); + size -= accounted_size; + + if (size == 0) + break; + + } + } +done: + UNLOCK (&wb_inode->lock); + + wb_fulfill_err (req, EIO); +out: + return; +} int wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -740,18 +918,10 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) { wb_fulfill_err (head, op_errno); } else if (op_ret < head->total_size) { - /* - * We've encountered a short write, for whatever reason. - * Set an EIO error for the next fop. This should be - * valid for writev or flush (close). - * - * TODO: Retry the write so we can potentially capture - * a real error condition (i.e., ENOSPC). - */ - wb_fulfill_err (head, EIO); - } - - wb_head_done (head); + wb_fulfill_short_write (head, op_ret); + } else { + wb_head_done (head); + } wb_process_queue (wb_inode); @@ -776,10 +946,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) int count = 0; wb_request_t *req = NULL; call_frame_t *frame = NULL; - gf_boolean_t fderr = _gf_false; - xlator_t *this = NULL; - - this = THIS; /* make sure head->total_size is updated before we run into any * errors @@ -795,11 +961,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) goto err; } - if (wb_fd_err (head->fd, this, NULL)) { - fderr = _gf_true; - goto err; - } - frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool); if (!frame) goto err; @@ -822,26 +983,21 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) return 0; err: - if (!fderr) { - /* frame creation failure */ - fderr = ENOMEM; - wb_fulfill_err (head, fderr); - } - - wb_head_done (head); + /* frame creation failure */ + wb_fulfill_err (head, ENOMEM); - return fderr; + return ENOMEM; } -#define NEXT_HEAD(head, req) do { \ - if (head) \ - ret |= wb_fulfill_head (wb_inode, head); \ - head = req; \ - expected_offset = req->stub->args.offset + \ - req->write_size; \ - curr_aggregate = 0; \ - vector_count = 0; \ +#define NEXT_HEAD(head, req) do { \ + if (head) \ + ret |= wb_fulfill_head (wb_inode, head); \ + head = req; \ + expected_offset = req->stub->args.offset + \ + req->write_size; \ + curr_aggregate = 0; \ + vector_count = 0; \ } while (0) @@ -1053,6 +1209,17 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) conf = wb_inode->this->private; list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { + if (wb_inode->dontsync && req->ordering.lied) { + /* sync has failed. Don't pick lies _again_ for winding + * as winding these lies again will trigger an infinite + * recursion of wb_process_queue being called from a + * failed fulfill. However, pick non-lied requests for + * winding so that application wont block indefinitely + * waiting for write result. + */ + continue; + } + if (!req->ordering.tempted) { if (holder) { if (wb_requests_conflict (holder, req)) @@ -1124,20 +1291,96 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) if (conf->trickling_writes && !wb_inode->transit && holder) holder->ordering.go = 1; + if (wb_inode->dontsync > 0) + wb_inode->dontsync--; + return; } +int +__wb_handle_failed_conflict (wb_request_t *req, wb_request_t *conflict, + list_head_t *tasks) +{ + wb_conf_t *conf = NULL; + + conf = req->wb_inode->this->private; + + if ((req->stub->fop != GF_FOP_FLUSH) + && ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) { + if (!req->ordering.lied && list_empty (&conflict->wip)) { + /* If request itself is in liability queue, + * 1. We cannot unwind as the response has already been + * sent. + * 2. We cannot wind till conflict clears up. + * 3. So, skip the request for now. + * 4. Otherwise, resume (unwind) it with error. + */ + req->op_ret = -1; + req->op_errno = conflict->op_errno; + + list_del_init (&req->todo); + list_add_tail (&req->winds, tasks); + } + } else { + /* flush and fsync (without conf->resync_after_fsync) act as + barriers. We cannot unwind them out of + order, when there are earlier generation writes just because + there is a conflicting liability with an error. So, wait for + our turn till there are no conflicting liabilities. + + This situation can arise when there liabilities spread across + multiple generations. For eg., consider two writes with + following characterstics: + + 1. they belong to different generations gen1, gen2 and + (gen1 > gen2). + 2. they overlap. + 3. both are liabilities. + 4. gen1 write was attempted to sync, but the attempt failed. + 5. there was no attempt to sync gen2 write yet. + 6. A flush (as part of close) is issued and gets a gen no + gen3. + + In the above scenario, if flush is unwound without waiting + for gen1 and gen2 writes either to be successfully synced or + purged, we end up with these two writes in wb_inode->todo + list forever as there will be no attempt to process the queue + as flush is the last operation. + */ + } + + return 0; +} + -void +int __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, list_head_t *liabilities) { - wb_request_t *req = NULL; - wb_request_t *tmp = NULL; + wb_request_t *req = NULL; + wb_request_t *tmp = NULL; + wb_request_t *conflict = NULL; list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { - if (wb_liability_has_conflict (wb_inode, req)) - continue; + conflict = wb_liability_has_conflict (wb_inode, req); + if (conflict) { + if (conflict->op_ret == -1) { + /* There is a conflicting liability which failed + * to sync in previous attempts, resume the req + * and fail, unless its an fsync/flush. + */ + + __wb_handle_failed_conflict (req, conflict, + tasks); + } else { + /* There is a conflicting liability which was + * not attempted to sync even once. Wait till + * atleast one attempt to sync is made. + */ + } + + continue; + } if (req->ordering.tempted && !req->ordering.go) /* wait some more */ @@ -1148,6 +1391,7 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, continue; list_add_tail (&req->wip, &wb_inode->wip); + req->wind_count++; if (!req->ordering.tempted) /* unrefed in wb_writev_cbk */ @@ -1162,6 +1406,8 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks, else list_add_tail (&req->winds, tasks); } + + return 0; } @@ -1174,7 +1420,12 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks) list_for_each_entry_safe (req, tmp, tasks, winds) { list_del_init (&req->winds); - call_resume (req->stub); + if (req->op_ret == -1) { + call_unwind_error (req->stub, req->op_ret, + req->op_errno); + } else { + call_resume (req->stub); + } wb_request_unref (req); } @@ -1184,10 +1435,10 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks) void wb_process_queue (wb_inode_t *wb_inode) { - list_head_t tasks = {0, }; - list_head_t lies = {0, }; - list_head_t liabilities = {0, }; - int retry = 0; + list_head_t tasks = {0, }; + list_head_t lies = {0, }; + list_head_t liabilities = {0, }; + int wind_failure = 0; INIT_LIST_HEAD (&tasks); INIT_LIST_HEAD (&lies); @@ -1209,13 +1460,12 @@ wb_process_queue (wb_inode_t *wb_inode) wb_do_winds (wb_inode, &tasks); - /* fd might've been marked bad due to previous errors. - * Since, caller of wb_process_queue might be the last fop on - * inode, make sure we keep processing request queue, till there - * are no requests left. + /* If there is an error in wb_fulfill before winding write + * requests, we would miss invocation of wb_process_queue + * from wb_fulfill_cbk. So, retry processing again. */ - retry = wb_fulfill (wb_inode, &liabilities); - } while (retry); + wind_failure = wb_fulfill (wb_inode, &liabilities); + } while (wind_failure); return; } @@ -1285,10 +1535,6 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, conf = this->private; - if (wb_fd_err (fd, this, &op_errno)) { - goto unwind; - } - wb_inode = wb_inode_create (this, fd->inode); if (!wb_inode) { op_errno = ENOMEM; @@ -1309,7 +1555,7 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, count, offset, flags, iobref, xdata); else stub = fop_writev_stub (frame, NULL, fd, vector, count, offset, - flags, iobref, xdata); + flags, iobref, xdata); if (!stub) { op_errno = ENOMEM; goto unwind; @@ -1413,10 +1659,6 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) goto unwind; } - if (wb_fd_err (fd, this, &op_errno)) { - op_ret = -1; - goto unwind; - } if (conf->flush_behind) goto flushbehind; @@ -1495,9 +1737,6 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, call_stub_t *stub = NULL; int32_t op_errno = EINVAL; - if (wb_fd_err (fd, this, &op_errno)) - goto unwind; - wb_inode = wb_inode_ctx_get (this, fd->inode); if (!wb_inode) goto noqueue; @@ -1720,9 +1959,6 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, goto unwind; } - if (wb_fd_err (fd, this, &op_errno)) - goto unwind; - frame->local = wb_inode; stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, @@ -2003,7 +2239,18 @@ __wb_dump_requests (struct list_head *head, char *prefix) else gf_proc_dump_write ("wound", "no"); + gf_proc_dump_write ("generation-number", "%d", req->gen); + + gf_proc_dump_write ("req->op_ret", "%d", req->op_ret); + gf_proc_dump_write ("req->op_errno", "%d", req->op_errno); + gf_proc_dump_write ("sync-attempts", "%d", req->wind_count); + if (req->fop == GF_FOP_WRITE) { + if (list_empty (&req->wip)) + gf_proc_dump_write ("sync-in-progress", "no"); + else + gf_proc_dump_write ("sync-in-progress", "yes"); + gf_proc_dump_write ("size", "%"GF_PRI_SIZET, req->write_size); @@ -2021,6 +2268,7 @@ __wb_dump_requests (struct list_head *head, char *prefix) flag = req->ordering.go; gf_proc_dump_write ("go", "%d", flag); + } } } @@ -2066,6 +2314,11 @@ wb_inode_dump (xlator_t *this, inode_t *inode) wb_inode->window_current); + gf_proc_dump_write ("transit-size", "%"GF_PRI_SIZET, + wb_inode->transit); + + gf_proc_dump_write ("dontsync", "%d", wb_inode->dontsync); + ret = TRY_LOCK (&wb_inode->lock); if (!ret) { @@ -2117,7 +2370,8 @@ reconfigure (xlator_t *this, dict_t *options) conf = this->private; - GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out); + GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, + out); GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, out); @@ -2130,6 +2384,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering, options, bool, out); + GF_OPTION_RECONF ("resync-failed-syncs-after-fsync", + conf->resync_after_fsync, options, bool, out); + ret = 0; out: return ret; @@ -2196,6 +2453,9 @@ init (xlator_t *this) GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering, bool, out); + GF_OPTION_INIT ("resync-failed-syncs-after-fsync", + conf->resync_after_fsync, bool, out); + this->private = conf; ret = 0; @@ -2287,5 +2547,17 @@ struct volume_options options[] = { .description = "Do not let later writes overtake earlier writes even " "if they do not overlap", }, + { .key = {"resync-failed-syncs-after-fsync"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If sync of \"cached-writes issued before fsync\" " + "(to backend) fails, this option configures whether " + "to retry syncing them after fsync or forget them. " + "If set to on, cached-writes are retried " + "till a \"flush\" fop (or a successful sync) on sync " + "failures. " + "fsync itself is failed irrespective of the value of " + "this option. ", + }, { .key = {NULL} }, }; |