author     Raghavendra G <rgowdapp@redhat.com>    2015-11-17 12:57:54 +0530
committer  Raghavendra G <rgowdapp@redhat.com>    2015-12-22 01:55:57 -0800
commit     3fcead2de7bcdb4e1312f37e7e750abd8d9d9770 (patch)
tree       6df073e7faf75ed55e997692201494fed547ce84 /xlators/performance
parent     d73f5c9eae0f5c8796598fa1930844c15c2c37d4 (diff)
performance/write-behind: retry "failed syncs to backend"
1. When a sync fails, the cached-write is still preserved unless there
   is a flush/fsync waiting on it.
2. When a sync fails and there is a flush/fsync waiting on the
   cached-write, the cache is thrown away and no further retries are
   made. In other words, flush/fsync act as barriers for all the
   previous writes: all previous writes are either successfully synced
   to the backend or forgotten in case of an error. Whether fsync acts
   as a barrier is controlled by an option (see below for details).
   Without such a barrier fop (especially flush, which is issued prior
   to a close), we would end up retrying forever even after the fd is
   closed. The decision logic is sketched in the example after this
   list.
3. If a fop is waiting on a cached-write and syncing it to the backend
   fails, the waiting fop is failed.
4. Sync failures when no fop is waiting are ignored and are not
   propagated to the application. For example:
   a. the first attempt to sync a cached-write w1 fails
   b. the second attempt to sync w1 succeeds
   If no fops dependent on w1 are issued between (a) and (b), the
   application won't know about the failure encountered in (a).
5. The effect of repeated sync failures is that there is no cache for
   future writes, so they cannot be written behind.
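
To make the rules above concrete, here is a minimal, self-contained C
sketch of the per-request decision taken when a sync attempt fails. It
is a simplified model of __wb_fulfill_request_err() from the patch
below, not the translator code itself; the enum and function names are
invented for illustration:

    #include <stdbool.h>

    /* Possible outcomes for a cached-write whose sync failed. */
    enum wb_err_action {
            FAIL_PENDING_FOP, /* app still waiting: fail the write itself  */
            FORGET,           /* barrier reached: record error, drop cache */
            RETRY             /* keep the cache, try syncing again later   */
    };

    /* lied               : write already acknowledged (written behind)
     * waiting_flush      : a flush on the same fd is queued behind it
     * waiting_fsync      : an fsync on the same fd is queued behind it
     * resync_after_fsync : value of resync-failed-syncs-after-fsync
     */
    static enum wb_err_action
    on_sync_failure (bool lied, bool waiting_flush, bool waiting_fsync,
                     bool resync_after_fsync)
    {
            if (!lied)
                    return FAIL_PENDING_FOP;          /* point 3 above */

            /* The error is stored in the waiter, so the flush/fsync is
             * failed; flush always acts as a barrier, fsync only when
             * resync is off. */
            if (waiting_flush || (waiting_fsync && !resync_after_fsync))
                    return FORGET;                    /* point 2 above */

            /* No barrier yet: preserve the cache and retry (points 1
             * and 4); an intervening successful sync makes the failure
             * invisible to the application. */
            return RETRY;
    }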
fsync as a barrier and resync of cached writes post fsync failure:
==================================================================
Whether to keep retrying failed syncs post fsync is controlled by an
option "resync-failed-syncs-after-fsync". By default, this option is
set to "off".
If sync of "cached-writes issued before fsync" (to backend) fails,
this option configures whether to retry syncing them after fsync or
forget them. If set to on, cached-writes are retried till a "flush"
fop (or a successful sync) on sync failures. fsync itself is failed
irrespective of the value of this option, when there is a sync failure
of any cached-writes issued before fsync.
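
For illustration, the new key can be set like any other write-behind
xlator option, e.g. directly in a volfile. The fragment below is a
hypothetical sketch (the volume and subvolume names are invented);
whether the option is also exposed through the "gluster volume set"
CLI depends on glusterd's option table, which this patch does not
touch:

    volume test-write-behind
        type performance/write-behind
        option resync-failed-syncs-after-fsync on
        subvolumes test-client-0
    end-volume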
Change-Id: I6097c9257bfb9ee5b15616fbe6a0576ae9af369a
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
BUG: 1279730
Reviewed-on: http://review.gluster.org/12594
Diffstat (limited to 'xlators/performance')
-rw-r--r--  xlators/performance/write-behind/src/write-behind.c | 480
1 file changed, 376 insertions(+), 104 deletions(-)
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 1adda4eaff4..285420526f4 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -107,6 +107,14 @@ typedef struct wb_inode {
         size_t       size; /* Size of the file to catch write after EOF. */
         gf_lock_t    lock;
         xlator_t    *this;
+        int          dontsync; /* If positive, dont pick lies for
+                                * winding. This is needed to break infinite
+                                * recursion during invocation of
+                                * wb_process_queue from
+                                * wb_fulfill_cbk in case of an
+                                * error during fulfill.
+                                */
+
 } wb_inode_t;
 
 
@@ -144,6 +152,8 @@ typedef struct wb_request {
                                           request arrival */
 
         fd_t                  *fd;
+        int                    wind_count; /* number of sync-attempts. Only
+                                              for debug purposes */
         struct {
                 size_t         size; /* 0 size == till infinity */
                 off_t          off;
@@ -164,6 +174,7 @@ typedef struct wb_conf {
         gf_boolean_t trickling_writes;
         gf_boolean_t strict_write_ordering;
         gf_boolean_t strict_O_DIRECT;
+        gf_boolean_t resync_after_fsync;
 } wb_conf_t;
 
 
@@ -202,26 +213,6 @@ out:
 }
 
 
-gf_boolean_t
-wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno)
-{
-        gf_boolean_t err   = _gf_false;
-        uint64_t     value = 0;
-        int32_t      tmp   = 0;
-
-        if (fd_ctx_get (fd, this, &value) == 0) {
-                if (op_errno) {
-                        tmp = value;
-                        *op_errno = tmp;
-                }
-
-                err = _gf_true;
-        }
-
-        return err;
-}
-
-
 /*
   Below is a succinct explanation of the code deciding whether two regions
   overlap, from Pavan <tcp@gluster.com>.
@@ -305,17 +296,17 @@ wb_requests_conflict (wb_request_t *lie, wb_request_t *req)
 }
 
 
-gf_boolean_t
+wb_request_t *
 wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)
 {
         wb_request_t *each = NULL;
 
         list_for_each_entry (each, &wb_inode->liability, lie) {
                 if (wb_requests_conflict (each, req))
-                        return _gf_true;
+                        return each;
         }
 
-        return _gf_false;
+        return NULL;
 }
 
 
@@ -552,6 +543,9 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
                 break;
 
         default:
+                if (stub && stub->args.fd)
+                        req->fd = fd_ref (stub->args.fd);
+
                 break;
         }
 
@@ -679,6 +673,88 @@ __wb_fulfill_request (wb_request_t *req)
 }
 
 
+/* get a flush/fsync waiting on req */
+wb_request_t *
+__wb_request_waiting_on (wb_request_t *req)
+{
+        wb_inode_t   *wb_inode = NULL;
+        wb_request_t *trav     = NULL;
+
+        wb_inode = req->wb_inode;
+
+        list_for_each_entry (trav, &wb_inode->todo, todo) {
+                if ((trav->fd == req->fd)
+                    && ((trav->stub->fop == GF_FOP_FLUSH)
+                        || (trav->stub->fop == GF_FOP_FSYNC))
+                    && (trav->gen >= req->gen))
+                        return trav;
+        }
+
+        return NULL;
+}
+
+
+void
+__wb_fulfill_request_err (wb_request_t *req, int32_t op_errno)
+{
+        wb_inode_t   *wb_inode = NULL;
+        wb_request_t *waiter   = NULL;
+        wb_conf_t    *conf     = NULL;
+
+        wb_inode = req->wb_inode;
+
+        conf = wb_inode->this->private;
+
+        req->op_ret = -1;
+        req->op_errno = op_errno;
+
+        if (req->ordering.lied)
+                waiter = __wb_request_waiting_on (req);
+
+        if (!req->ordering.lied || waiter) {
+                if (!req->ordering.lied) {
+                        /* response to app is still pending, send failure in
+                         * response.
+                         */
+                } else {
+                        /* response was sent, store the error in a
+                         * waiter (either an fsync or flush).
+                         */
+                        waiter->op_ret = -1;
+                        waiter->op_errno = op_errno;
+                }
+
+                if (!req->ordering.lied
+                    || (waiter->stub->fop == GF_FOP_FLUSH)
+                    || ((waiter->stub->fop == GF_FOP_FSYNC)
+                        && !conf->resync_after_fsync)) {
+                        /* No retry needed, forget the request */
+                        __wb_fulfill_request (req);
+                        return;
+                }
+        }
+
+        /* response was unwound and no waiter waiting on this request, retry
+           till a flush or fsync (subject to conf->resync_after_fsync).
+         */
+        wb_inode->transit -= req->total_size;
+
+        req->total_size = 0;
+
+        list_del_init (&req->winds);
+        list_del_init (&req->todo);
+        list_del_init (&req->wip);
+
+        /* sanitize ordering flags to retry */
+        req->ordering.go = 0;
+
+        /* Add back to todo list to retry */
+        list_add (&req->todo, &wb_inode->todo);
+
+        return;
+}
+
+
 void
 wb_head_done (wb_request_t *head)
 {
@@ -693,6 +769,7 @@ wb_head_done (wb_request_t *head)
                 list_for_each_entry_safe (req, tmp, &head->winds, winds) {
                         __wb_fulfill_request (req);
                 }
+
                 __wb_fulfill_request (head);
         }
         UNLOCK (&wb_inode->lock);
@@ -700,29 +777,130 @@ wb_head_done (wb_request_t *head)
 
 
 void
+__wb_fulfill_err (wb_request_t *head, int op_errno)
+{
+        wb_request_t *req = NULL, *tmp = NULL;
+
+        if (!head)
+                goto out;
+
+        head->wb_inode->dontsync++;
+
+        list_for_each_entry_safe_reverse (req, tmp, &head->winds,
+                                          winds) {
+                __wb_fulfill_request_err (req, op_errno);
+        }
+
+        __wb_fulfill_request_err (head, op_errno);
+
+out:
+        return;
+}
+
+
+void
 wb_fulfill_err (wb_request_t *head, int op_errno)
 {
-        wb_inode_t *wb_inode;
-        wb_request_t *req;
+        wb_inode_t *wb_inode = NULL;
 
         wb_inode = head->wb_inode;
 
-        /* for all future requests yet to arrive */
-        fd_ctx_set (head->fd, THIS, op_errno);
-
         LOCK (&wb_inode->lock);
         {
-                /* for all requests already arrived */
-                list_for_each_entry (req, &wb_inode->all, all) {
-                        if (req->fd != head->fd)
-                                continue;
-
-                        req->op_ret = -1;
-                        req->op_errno = op_errno;
-                }
+                __wb_fulfill_err (head, op_errno);
         }
         UNLOCK (&wb_inode->lock);
 }
 
 
+inline void
+__wb_modify_write_request (wb_request_t *req, int synced_size,
+                           int head_total_size)
+{
+        struct iovec *vector = NULL;
+        int           count  = 0;
+
+        if (!req || synced_size == 0)
+                goto out;
+
+        req->write_size -= synced_size;
+        req->stub->args.offset += synced_size;
+        req->total_size = head_total_size;
+
+        vector = req->stub->args.vector;
+        count = req->stub->args.count;
+
+        req->stub->args.count = iov_subset (vector, count, synced_size,
+                                            iov_length (vector, count),
+                                            vector);
+
+out:
+        return;
+}
+
+
+int
+__wb_fulfill_short_write (wb_request_t *req, int size, int total_size)
+{
+        int accounted_size = 0;
+
+        if (req == NULL)
+                goto out;
+
+        if (req->write_size <= size) {
+                accounted_size = req->write_size;
+                __wb_fulfill_request (req);
+        } else {
+                accounted_size = size;
+                __wb_modify_write_request (req, size, total_size);
+        }
+
+out:
+        return accounted_size;
+}
+
+
+void
+wb_fulfill_short_write (wb_request_t *head, int size)
+{
+        wb_inode_t   *wb_inode   = NULL;
+        wb_request_t *req        = NULL, *tmp = NULL;
+        int           total_size = 0, accounted_size = 0;
+
+        if (!head)
+                goto out;
+
+        wb_inode = head->wb_inode;
+
+        total_size = head->total_size - size;
+        head->total_size = size;
+
+        req = head;
+
+        LOCK (&wb_inode->lock);
+        {
+                accounted_size = __wb_fulfill_short_write (head, size,
+                                                           total_size);
+
+                size -= accounted_size;
+
+                if (size == 0)
+                        goto done;
+
+                list_for_each_entry_safe (req, tmp, &head->winds, winds) {
+                        accounted_size = __wb_fulfill_short_write (req, size,
                                                                    total_size);
+                        size -= accounted_size;
+
+                        if (size == 0)
+                                break;
+
+                }
+        }
done:
+        UNLOCK (&wb_inode->lock);
+
+        wb_fulfill_err (req, EIO);
+out:
+        return;
+}
+
 
 int
 wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -740,18 +918,10 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
 
         if (op_ret == -1) {
                 wb_fulfill_err (head, op_errno);
         } else if (op_ret < head->total_size) {
-                /*
-                 * We've encountered a short write, for whatever reason.
-                 * Set an EIO error for the next fop. This should be
-                 * valid for writev or flush (close).
-                 *
-                 * TODO: Retry the write so we can potentially capture
-                 * a real error condition (i.e., ENOSPC).
-                 */
-                wb_fulfill_err (head, EIO);
-        }
-
-        wb_head_done (head);
+                wb_fulfill_short_write (head, op_ret);
+        } else {
+                wb_head_done (head);
+        }
 
         wb_process_queue (wb_inode);
 
@@ -776,10 +946,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
         int           count = 0;
         wb_request_t *req   = NULL;
         call_frame_t *frame = NULL;
-        gf_boolean_t  fderr = _gf_false;
-        xlator_t     *this  = NULL;
-
-        this = THIS;
 
         /* make sure head->total_size is updated before we run into any
          * errors
@@ -795,11 +961,6 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
                 goto err;
         }
 
-        if (wb_fd_err (head->fd, this, NULL)) {
-                fderr = _gf_true;
-                goto err;
-        }
-
         frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);
         if (!frame)
                 goto err;
@@ -822,26 +983,21 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
         return 0;
 
 err:
-        if (!fderr) {
-                /* frame creation failure */
-                fderr = ENOMEM;
-                wb_fulfill_err (head, fderr);
-        }
-
-        wb_head_done (head);
+        /* frame creation failure */
+        wb_fulfill_err (head, ENOMEM);
 
-        return fderr;
+        return ENOMEM;
 }
 
 
-#define NEXT_HEAD(head, req) do {                                      \
-        if (head)                                                      \
-                ret |= wb_fulfill_head (wb_inode, head);               \
-        head = req;                                                    \
-        expected_offset = req->stub->args.offset +                     \
-                req->write_size;                                       \
-        curr_aggregate = 0;                                            \
-        vector_count = 0;                                              \
+#define NEXT_HEAD(head, req) do {                                       \
+        if (head)                                                       \
+                ret |= wb_fulfill_head (wb_inode, head);                \
+        head = req;                                                     \
+        expected_offset = req->stub->args.offset +                      \
+                req->write_size;                                        \
+        curr_aggregate = 0;                                             \
+        vector_count = 0;                                               \
         } while (0)
 
 
@@ -1053,6 +1209,17 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
         conf = wb_inode->this->private;
 
         list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) {
+                if (wb_inode->dontsync && req->ordering.lied) {
+                        /* sync has failed. Don't pick lies _again_ for winding
+                         * as winding these lies again will trigger an infinite
+                         * recursion of wb_process_queue being called from a
+                         * failed fulfill. However, pick non-lied requests for
+                         * winding so that application wont block indefinitely
+                         * waiting for write result.
+                         */
+                        continue;
+                }
+
                 if (!req->ordering.tempted) {
                         if (holder) {
                                 if (wb_requests_conflict (holder, req))
@@ -1124,20 +1291,96 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
         if (conf->trickling_writes && !wb_inode->transit && holder)
                 holder->ordering.go = 1;
 
+        if (wb_inode->dontsync > 0)
+                wb_inode->dontsync--;
+
         return;
 }
 
 
+int
+__wb_handle_failed_conflict (wb_request_t *req, wb_request_t *conflict,
+                             list_head_t *tasks)
+{
+        wb_conf_t *conf = NULL;
+
+        conf = req->wb_inode->this->private;
+
+        if ((req->stub->fop != GF_FOP_FLUSH)
+            && ((req->stub->fop != GF_FOP_FSYNC)
+                || conf->resync_after_fsync)) {
+                if (!req->ordering.lied && list_empty (&conflict->wip)) {
+                        /* If request itself is in liability queue,
+                         * 1. We cannot unwind as the response has already been
+                         *    sent.
+                         * 2. We cannot wind till conflict clears up.
+                         * 3. So, skip the request for now.
+                         * 4. Otherwise, resume (unwind) it with error.
+                         */
+                        req->op_ret = -1;
+                        req->op_errno = conflict->op_errno;
+
+                        list_del_init (&req->todo);
+                        list_add_tail (&req->winds, tasks);
+                }
+        } else {
+                /* flush and fsync (without conf->resync_after_fsync) act as
+                   barriers. We cannot unwind them out of
+                   order, when there are earlier generation writes just because
+                   there is a conflicting liability with an error. So, wait for
+                   our turn till there are no conflicting liabilities.
+
+                   This situation can arise when there liabilities spread across
+                   multiple generations. For eg., consider two writes with
+                   following characterstics:
+
+                   1. they belong to different generations gen1, gen2 and
+                      (gen1 > gen2).
+                   2. they overlap.
+                   3. both are liabilities.
+                   4. gen1 write was attempted to sync, but the attempt failed.
+                   5. there was no attempt to sync gen2 write yet.
+                   6. A flush (as part of close) is issued and gets a gen no
+                      gen3.
+
+                   In the above scenario, if flush is unwound without waiting
+                   for gen1 and gen2 writes either to be successfully synced or
+                   purged, we end up with these two writes in wb_inode->todo
+                   list forever as there will be no attempt to process the queue
+                   as flush is the last operation.
+                */
+        }
+
+        return 0;
+}
+
+
-void
+int
 __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,
                  list_head_t *liabilities)
 {
-        wb_request_t *req = NULL;
-        wb_request_t *tmp = NULL;
+        wb_request_t *req      = NULL;
+        wb_request_t *tmp      = NULL;
+        wb_request_t *conflict = NULL;
 
         list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) {
-                if (wb_liability_has_conflict (wb_inode, req))
-                        continue;
+                conflict = wb_liability_has_conflict (wb_inode, req);
+                if (conflict) {
+                        if (conflict->op_ret == -1) {
+                                /* There is a conflicting liability which failed
+                                 * to sync in previous attempts, resume the req
+                                 * and fail, unless its an fsync/flush.
+                                 */
+
+                                __wb_handle_failed_conflict (req, conflict,
+                                                             tasks);
+                        } else {
+                                /* There is a conflicting liability which was
+                                 * not attempted to sync even once. Wait till
+                                 * atleast one attempt to sync is made.
+                                 */
+                        }
+
+                        continue;
+                }
 
                 if (req->ordering.tempted && !req->ordering.go)
                         /* wait some more */
@@ -1148,6 +1391,7 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,
                         continue;
 
                 list_add_tail (&req->wip, &wb_inode->wip);
+                req->wind_count++;
 
                 if (!req->ordering.tempted)
                         /* unrefed in wb_writev_cbk */
@@ -1162,6 +1406,8 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,
                 else
                         list_add_tail (&req->winds, tasks);
         }
+
+        return 0;
 }
 
 
@@ -1174,7 +1420,12 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks)
         list_for_each_entry_safe (req, tmp, tasks, winds) {
                 list_del_init (&req->winds);
 
-                call_resume (req->stub);
+                if (req->op_ret == -1) {
+                        call_unwind_error (req->stub, req->op_ret,
+                                           req->op_errno);
+                } else {
+                        call_resume (req->stub);
+                }
 
                 wb_request_unref (req);
         }
@@ -1184,10 +1435,10 @@ wb_do_winds (wb_inode_t *wb_inode, list_head_t *tasks)
 void
 wb_process_queue (wb_inode_t *wb_inode)
 {
-        list_head_t tasks  = {0, };
-        list_head_t lies = {0, };
-        list_head_t liabilities = {0, };
-        int     retry = 0;
+        list_head_t tasks        = {0, };
+        list_head_t lies         = {0, };
+        list_head_t liabilities  = {0, };
+        int         wind_failure = 0;
 
         INIT_LIST_HEAD (&tasks);
         INIT_LIST_HEAD (&lies);
@@ -1209,13 +1460,12 @@ wb_process_queue (wb_inode_t *wb_inode)
 
                 wb_do_winds (wb_inode, &tasks);
 
-                /* fd might've been marked bad due to previous errors.
-                 * Since, caller of wb_process_queue might be the last fop on
-                 * inode, make sure we keep processing request queue, till there
-                 * are no requests left.
+                /* If there is an error in wb_fulfill before winding write
+                 * requests, we would miss invocation of wb_process_queue
+                 * from wb_fulfill_cbk. So, retry processing again.
                  */
-                retry = wb_fulfill (wb_inode, &liabilities);
-        } while (retry);
+                wind_failure = wb_fulfill (wb_inode, &liabilities);
+        } while (wind_failure);
 
         return;
 }
@@ -1285,10 +1535,6 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
 
         conf = this->private;
 
-        if (wb_fd_err (fd, this, &op_errno)) {
-                goto unwind;
-        }
-
         wb_inode = wb_inode_create (this, fd->inode);
         if (!wb_inode) {
                 op_errno = ENOMEM;
@@ -1309,7 +1555,7 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
                                          count, offset, flags, iobref, xdata);
         else
                 stub = fop_writev_stub (frame, NULL, fd, vector, count, offset,
-                                       flags, iobref, xdata);
+                                        flags, iobref, xdata);
         if (!stub) {
                 op_errno = ENOMEM;
                 goto unwind;
@@ -1413,10 +1659,6 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
                 goto unwind;
         }
 
-        if (wb_fd_err (fd, this, &op_errno)) {
-                op_ret = -1;
-                goto unwind;
-        }
 
         if (conf->flush_behind)
                 goto flushbehind;
@@ -1495,9 +1737,6 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
         call_stub_t *stub     = NULL;
         int32_t      op_errno = EINVAL;
 
-        if (wb_fd_err (fd, this, &op_errno))
-                goto unwind;
-
         wb_inode = wb_inode_ctx_get (this, fd->inode);
         if (!wb_inode)
                 goto noqueue;
@@ -1720,9 +1959,6 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
                 goto unwind;
         }
 
-        if (wb_fd_err (fd, this, &op_errno))
-                goto unwind;
-
         frame->local = wb_inode;
 
         stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
@@ -2003,7 +2239,18 @@ __wb_dump_requests (struct list_head *head, char *prefix)
                 else
                         gf_proc_dump_write ("wound", "no");
 
+                gf_proc_dump_write ("generation-number", "%d", req->gen);
+
+                gf_proc_dump_write ("req->op_ret", "%d", req->op_ret);
+                gf_proc_dump_write ("req->op_errno", "%d", req->op_errno);
+                gf_proc_dump_write ("sync-attempts", "%d", req->wind_count);
+
                 if (req->fop == GF_FOP_WRITE) {
+                        if (list_empty (&req->wip))
+                                gf_proc_dump_write ("sync-in-progress", "no");
+                        else
+                                gf_proc_dump_write ("sync-in-progress", "yes");
+
                         gf_proc_dump_write ("size", "%"GF_PRI_SIZET,
                                             req->write_size);
 
@@ -2021,6 +2268,7 @@ __wb_dump_requests (struct list_head *head, char *prefix)
 
                         flag = req->ordering.go;
                         gf_proc_dump_write ("go", "%d", flag);
+
                 }
         }
 }
@@ -2066,6 +2314,11 @@ wb_inode_dump (xlator_t *this, inode_t *inode)
                             wb_inode->window_current);
 
+        gf_proc_dump_write ("transit-size", "%"GF_PRI_SIZET,
+                            wb_inode->transit);
+
+        gf_proc_dump_write ("dontsync", "%d", wb_inode->dontsync);
+
         ret = TRY_LOCK (&wb_inode->lock);
         if (!ret) {
@@ -2117,7 +2370,8 @@ reconfigure (xlator_t *this, dict_t *options)
 
         conf = this->private;
 
-        GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out);
+        GF_OPTION_RECONF ("cache-size", conf->window_size, options,
+                          size_uint64, out);
 
         GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,
                           out);
@@ -2130,6 +2384,9 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("strict-write-ordering", conf->strict_write_ordering,
                           options, bool, out);
 
+        GF_OPTION_RECONF ("resync-failed-syncs-after-fsync",
+                          conf->resync_after_fsync, options, bool, out);
+
         ret = 0;
 out:
         return ret;
@@ -2196,6 +2453,9 @@ init (xlator_t *this)
         GF_OPTION_INIT ("strict-write-ordering", conf->strict_write_ordering,
                         bool, out);
 
+        GF_OPTION_INIT ("resync-failed-syncs-after-fsync",
+                        conf->resync_after_fsync, bool, out);
+
         this->private = conf;
         ret = 0;
 
@@ -2287,5 +2547,17 @@ struct volume_options options[] = {
           .description = "Do not let later writes overtake earlier writes even "
                          "if they do not overlap",
         },
+        { .key = {"resync-failed-syncs-after-fsync"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "If sync of \"cached-writes issued before fsync\" "
+                         "(to backend) fails, this option configures whether "
+                         "to retry syncing them after fsync or forget them. "
+                         "If set to on, cached-writes are retried "
+                         "till a \"flush\" fop (or a successful sync) on sync "
+                         "failures. "
+                         "fsync itself is failed irrespective of the value of "
+                         "this option. ",
+        },
         { .key = {NULL} },
 };
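
The short-write path above (wb_fulfill_short_write and its helpers) is
the subtlest part of the patch. The following self-contained C sketch
models only its accounting, with a plain array standing in for the
kernel-style request lists; the names are invented for illustration and
this is not the translator code:

    #include <stddef.h>

    /* A head request and its "winds" list were submitted as one
     * aggregated write; only `synced` bytes reached the backend.
     * Fully-synced requests are fulfilled and forgotten, the partially
     * synced one is trimmed so only its unsynced tail remains. */
    struct model_req {
            size_t write_size; /* bytes still owed to the backend */
            size_t offset;     /* file offset of the unsynced tail */
    };

    /* Returns the index of the first request with unsynced bytes, or
     * n if `synced` covered everything. */
    static size_t
    account_short_write (struct model_req *reqs, size_t n, size_t synced)
    {
            size_t i;

            for (i = 0; i < n && synced > 0; i++) {
                    if (reqs[i].write_size <= synced) {
                            /* fully synced: fulfil and forget */
                            synced -= reqs[i].write_size;
                            reqs[i].write_size = 0;
                    } else {
                            /* partially synced: trim, keep the tail */
                            reqs[i].write_size -= synced;
                            reqs[i].offset += synced;
                            synced = 0;
                    }
            }

            /* in the real code, the request holding the unsynced tail
             * is then failed with EIO via wb_fulfill_err() */
            for (i = 0; i < n; i++)
                    if (reqs[i].write_size > 0)
                            return i;

            return n;
    }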