diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2012-01-19 17:49:42 -0500 |
---|---|---|
committer | Anand Avati <avati@gluster.com> | 2012-01-30 05:09:35 -0800 |
commit | c3aa99d907591f72b6302287b9b8899514fb52f1 (patch) | |
tree | 40dceb4deb7e1987caf4004eb76ead91ef4e6fd6 | |
parent | 20d74c540879d3994d56b9baf7044c79ae5df5e3 (diff) |
Fix race between read-ahead and write.
Change-Id: I0ed1aca585733302b5e3840f392849e12f0b0f0d
BUG: 783313
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.com/2666
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@gluster.com>
-rw-r--r-- | xlators/performance/read-ahead/src/page.c | 31 | ||||
-rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.c | 60 | ||||
-rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.h | 3 |
3 files changed, 68 insertions, 26 deletions
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c index 9778ef54258..0c9a61853c8 100644 --- a/xlators/performance/read-ahead/src/page.c +++ b/xlators/performance/read-ahead/src/page.c @@ -175,14 +175,8 @@ ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) file->stbuf = *stbuf; - if (op_ret < 0) { - page = ra_page_get (file, pending_offset); - if (page) - waitq = ra_page_error (page, op_ret, op_errno); - goto unlock; - } - page = ra_page_get (file, pending_offset); + if (!page) { gf_log (this->name, GF_LOG_TRACE, "wasted copy: %"PRId64"[+%"PRId64"] file=%p", @@ -190,6 +184,29 @@ ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* + * "Dirty" means that the request was a pure read-ahead; it's + * set for requests we issue ourselves, and cleared when user + * requests are issued or put on the waitq. "Poisoned" means + * that we got a write while a read was still in flight, and we + * couldn't stop it so we marked it instead. If it's both + * dirty and poisoned by the time we get here, we cancel its + * effect so that a subsequent user read doesn't get data that + * we know is stale (because we made it stale ourselves). We + * can't use ESTALE because that has special significance. + * ECANCELED has no such special meaning, and is close to what + * we're trying to indicate. + */ + if (page->dirty && page->poisoned) { + op_ret = -1; + op_errno = ECANCELED; + } + + if (op_ret < 0) { + waitq = ra_page_error (page, op_ret, op_errno); + goto unlock; + } + if (page->vector) { iobref_unref (page->iobref); GF_FREE (page->vector); diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index 37f34f2eb91..928d8b7bdbc 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -231,7 +231,8 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ static void -flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size) +flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, + int for_write) { ra_page_t *trav = NULL; ra_page_t *next = NULL; @@ -243,8 +244,13 @@ flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size) && trav->offset < (offset + size)) { next = trav->next; - if (trav->offset >= offset && !trav->waitq) { - ra_page_purge (trav); + if (trav->offset >= offset) { + if (!trav->waitq) { + ra_page_purge (trav); + } + else if (for_write) { + trav->poisoned = 1; + } } trav = next; } @@ -392,15 +398,15 @@ dispatch_requests (call_frame_t *frame, ra_file_t *file) trav = ra_page_get (file, trav_offset); if (!trav) { trav = ra_page_create (file, trav_offset); + if (!trav) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } fault = 1; need_atime_update = 0; } - - if (!trav) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } + trav->dirty = 0; if (trav->ready) { gf_log (frame->this->name, GF_LOG_TRACE, @@ -513,7 +519,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, } if (!expected_offset) { - flush_region (frame, file, 0, file->pages.prev->offset + 1); + flush_region (frame, file, 0, file->pages.prev->offset + 1, 0); } local = (void *) GF_CALLOC (1, sizeof (*local), gf_ra_mt_ra_local_t); @@ -536,7 +542,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, dispatch_requests (frame, file); - flush_region (frame, file, 0, floor (offset, file->page_size)); + flush_region (frame, file, 0, floor (offset, file->page_size), 0); read_ahead (frame, file); @@ -596,7 +602,7 @@ ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) file = (ra_file_t *)(long)tmp_file; if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); + flush_region (frame, file, 0, file->pages.prev->offset+1, 0); } STACK_WIND (frame, ra_flush_cbk, FIRST_CHILD (this), @@ -624,7 +630,7 @@ ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) file = (ra_file_t *)(long)tmp_file; if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); + flush_region (frame, file, 0, file->pages.prev->offset+1, 0); } STACK_WIND (frame, ra_fsync_cbk, FIRST_CHILD (this), @@ -649,7 +655,7 @@ ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, file = frame->local; if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); + flush_region (frame, file, 0, file->pages.prev->offset+1, 1); } frame->local = NULL; @@ -673,7 +679,7 @@ ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, fd_ctx_get (fd, this, &tmp_file); file = (ra_file_t *)(long)tmp_file; if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); + flush_region (frame, file, 0, file->pages.prev->offset+1, 1); frame->local = file; /* reset the read-ahead counters too */ file->expected = file->page_count = 0; @@ -739,8 +745,16 @@ ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) if (!file) continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_ftruncate. + */ flush_region (frame, file, 0, - file->pages.prev->offset + 1); + file->pages.prev->offset + 1, 1); } } UNLOCK (&inode->lock); @@ -775,6 +789,8 @@ ra_page_dump (struct ra_page *page) gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write ("poisoned", "%s", page->poisoned ? "yes" : "no"); + gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no"); for (trav = page->waitq; trav; trav = trav->next) { @@ -870,7 +886,7 @@ ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) if (!file) continue; flush_region (frame, file, 0, - file->pages.prev->offset + 1); + file->pages.prev->offset + 1, 0); } } UNLOCK (&inode->lock); @@ -907,8 +923,16 @@ ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) file = (ra_file_t *)(long)tmp_file; if (!file) continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_truncate. + */ flush_region (frame, file, 0, - file->pages.prev->offset + 1); + file->pages.prev->offset + 1, 1); } } UNLOCK (&inode->lock); diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h index d0bbcde810f..f5e73cb6697 100644 --- a/xlators/performance/read-ahead/src/read-ahead.h +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -76,7 +76,8 @@ struct ra_page { struct ra_page *next; struct ra_page *prev; struct ra_file *file; - char dirty; + char dirty; /* Internal request, not from user. */ + char poisoned; /* Pending read invalidated by write. */ char ready; struct iovec *vector; int32_t count; |