From 6d9b11dba63d86c48450aa956281114962289ef5 Mon Sep 17 00:00:00 2001 From: Raghavendra G Date: Wed, 26 May 2010 06:08:09 +0000 Subject: performance/write-behind: explicitly enforce ordering of overlapping writes. - If there are non-contiguous offsets (offsets which do not start where previous write ended), wait for completion of previous writes to server, before sending new ones. - Send flush call to server only when all writes are completed. - If a file is opened with O_APPEND, at any point of time a maximum only one write call to server should be in transit. This is to avoid reordering of writes in the presence of afr which can result in data corruption. See bug #934 for more details. Signed-off-by: Raghavendra G Signed-off-by: Anand V. Avati BUG: 934 (md5sum mismatch when files are transferred using vsftpd) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=934 --- .../performance/write-behind/src/write-behind.c | 247 +++++++++++---------- 1 file changed, 125 insertions(+), 122 deletions(-) diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 883d25e9666..064e0b717c7 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -55,6 +55,7 @@ typedef struct wb_file { int32_t refcount; int32_t op_ret; int32_t op_errno; + int32_t flags; list_head_t request; list_head_t passive_requests; fd_t *fd; @@ -265,7 +266,7 @@ out: wb_file_t * -wb_file_create (xlator_t *this, fd_t *fd) +wb_file_create (xlator_t *this, fd_t *fd, int32_t flags) { wb_file_t *file = NULL; wb_conf_t *conf = this->private; @@ -287,6 +288,7 @@ wb_file_create (xlator_t *this, fd_t *fd) file->this = this; file->refcount = 1; file->window_conf = conf->window_size; + file->flags = flags; fd_ctx_set (fd, this, (uint64_t)(long)file); @@ -1228,7 +1230,7 @@ wb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, wbflags = local->wbflags; if (op_ret != -1) { - file = wb_file_create (this, fd); + file = wb_file_create (this, fd, flags); if (file == NULL) { op_ret = -1; op_errno = ENOMEM; @@ -1307,7 +1309,7 @@ wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, wb_conf_t *conf = this->private; if (op_ret != -1) { - file = wb_file_create (this, fd); + file = wb_file_create (this, fd, flags); if (file == NULL) { op_ret = -1; op_errno = ENOMEM; @@ -1338,8 +1340,8 @@ wb_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, frame->local = NULL; out: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, preparent, - postparent); + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent); return 0; } @@ -1358,14 +1360,22 @@ wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, return 0; } - +/* Mark all the contiguous write requests for winding starting from head of + * request list. Stops marking at the first non-write request found. If + * file is opened with O_APPEND, make sure all the writes marked for winding + * will fit into a single write call to server. + */ size_t __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds) { - wb_request_t *request = NULL; - size_t size = 0; - char first_request = 1; + wb_request_t *request = NULL; + size_t size = 0; + char first_request = 1; off_t offset_expected = 0; + wb_conf_t *conf = NULL; + int count = 0; + + conf = file->this->private; list_for_each_entry (request, list, list) { @@ -1384,9 +1394,18 @@ __wb_mark_wind_all (wb_file_t *file, list_head_t *list, list_head_t *winds) break; } + if ((file->flags & O_APPEND) + && (((size + request->write_size) + > conf->aggregate_size) + || ((count + request->stub->args.writev.count) + > conf->aggregate_size))) { + break; + } + size += request->write_size; offset_expected += request->write_size; file->aggregate_current -= request->write_size; + count += request->stub->args.writev.count; request->flags.write_request.stack_wound = 1; list_add_tail (&request->winds, winds); @@ -1461,15 +1480,14 @@ __wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf, request = list_entry (list->next, typeof (*request), list); file = request->file; - if (!wind_all && (file->aggregate_current < aggregate_conf)) { - __wb_can_wind (list, &other_fop_in_queue, - &non_contiguous_writes, &incomplete_writes); - } + __wb_can_wind (list, &other_fop_in_queue, + &non_contiguous_writes, &incomplete_writes); - if ((enable_trickling_writes && !incomplete_writes) - || (wind_all) || (non_contiguous_writes) - || (other_fop_in_queue) - || (file->aggregate_current >= aggregate_conf)) { + if (!incomplete_writes && ((enable_trickling_writes) + || (wind_all) || (non_contiguous_writes) + || (other_fop_in_queue) + || (file->aggregate_current + >= aggregate_conf))) { size = __wb_mark_wind_all (file, list, winds); } @@ -2087,64 +2105,66 @@ wb_ffr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno) { wb_local_t *local = NULL; - wb_file_t *file = NULL; - wb_conf_t *conf = NULL; - char unwind = 0; - int32_t ret = -1; - int disabled = 0; - int64_t disable_till = 0; + wb_file_t *file = NULL; + wb_conf_t *conf = NULL; + int32_t ret = -1; conf = this->private; local = frame->local; + file = local->file; - if ((local != NULL) && (local->file != NULL)) { - file = local->file; - + if (file != NULL) { LOCK (&file->lock); { - disabled = file->disabled; - disable_till = file->disable_till; + if (file->op_ret == -1) { + op_ret = file->op_ret; + op_errno = file->op_errno; + + file->op_ret = 0; + } } UNLOCK (&file->lock); - if (conf->flush_behind - && (!disabled) && (disable_till == 0)) { - unwind = 1; - } else { - local->reply_count++; - /* - * without flush-behind, unwind should wait for replies - * of writes queued before and the flush - */ - if (local->reply_count == 2) { - unwind = 1; - } + ret = wb_process_queue (frame, file, 0); + if ((ret == -1) && (errno == ENOMEM)) { + op_ret = -1; + op_errno = ENOMEM; } - } else { - unwind = 1; } + + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); - if (unwind) { - if (file != NULL) { - LOCK (&file->lock); - { - if (file->op_ret == -1) { - op_ret = file->op_ret; - op_errno = file->op_errno; + return 0; +} - file->op_ret = 0; - } - } - UNLOCK (&file->lock); - ret = wb_process_queue (frame, file, 0); - if ((ret == -1) && (errno == ENOMEM)) { - op_ret = -1; - op_errno = ENOMEM; - } - } +int32_t +wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + wb_conf_t *conf = NULL; + wb_local_t *local = NULL; + + conf = this->private; + + local = frame->local; + + if (local && local->request) { + local->request->stub = NULL; + wb_request_unref (local->request); + } - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); + if (conf->flush_behind) { + STACK_WIND (frame, + wb_ffr_bg_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + } else { + STACK_WIND (frame, + wb_ffr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); } return 0; @@ -2159,12 +2179,9 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) wb_local_t *local = NULL; uint64_t tmp_file = 0; call_stub_t *stub = NULL; - call_frame_t *process_frame = NULL; - wb_local_t *tmp_local = NULL; + call_frame_t *flush_frame = NULL; wb_request_t *request = NULL; int32_t ret = 0; - int disabled = 0; - int64_t disable_till = 0; conf = this->private; @@ -2183,93 +2200,79 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) if (file != NULL) { local = CALLOC (1, sizeof (*local)); if (local == NULL) { - STACK_UNWIND (frame, -1, ENOMEM, NULL); + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); return 0; } local->file = file; - frame->local = local; - stub = fop_flush_cbk_stub (frame, wb_ffr_cbk, 0, 0); - if (stub == NULL) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); - return 0; + if (conf->flush_behind) { + flush_frame = copy_frame (frame); + if (flush_frame == NULL) { + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); + return 0; + } + } else { + flush_frame = frame; } - process_frame = copy_frame (frame); - if (process_frame == NULL) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); - call_stub_destroy (stub); - return 0; - } + flush_frame->local = local; - LOCK (&file->lock); - { - disabled = file->disabled; - disable_till = file->disable_till; - } - UNLOCK (&file->lock); - - if (conf->flush_behind - && (!disabled) && (disable_till == 0)) { - tmp_local = CALLOC (1, sizeof (*local)); - if (tmp_local == NULL) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); - - STACK_DESTROY (process_frame->root); - call_stub_destroy (stub); - return 0; + stub = fop_flush_stub (flush_frame, wb_flush_helper, fd); + if (stub == NULL) { + if (flush_frame != frame) { + STACK_DESTROY (flush_frame->root); } - tmp_local->file = file; - process_frame->local = tmp_local; + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); + return 0; } - fd_ref (fd); - request = wb_enqueue (file, stub); if (request == NULL) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); + if (flush_frame != frame) { + STACK_DESTROY (flush_frame->root); + } - fd_unref (fd); + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); call_stub_destroy (stub); - STACK_DESTROY (process_frame->root); return 0; } - ret = wb_process_queue (process_frame, file, 1); + ret = wb_process_queue (flush_frame, file, 1); if ((ret == -1) && (errno == ENOMEM)) { - STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); + if (flush_frame != frame) { + STACK_DESTROY (flush_frame->root); + } - fd_unref (fd); + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); call_stub_destroy (stub); - STACK_DESTROY (process_frame->root); return 0; } - } - - if ((file != NULL) && conf->flush_behind - && (!disabled) && (disable_till == 0)) { - STACK_WIND (process_frame, - wb_ffr_bg_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); } else { - STACK_WIND (frame, - wb_ffr_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->flush, - fd); + if (conf->flush_behind) { + flush_frame = copy_frame (frame); + if (flush_frame == NULL) { + STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM); + return 0; + } - if (process_frame != NULL) { - STACK_DESTROY (process_frame->root); + STACK_WIND (flush_frame, + wb_ffr_bg_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + } else { + STACK_WIND (frame, + wb_ffr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); } } - - if (file != NULL) { - fd_unref (fd); + if (conf->flush_behind) { + STACK_UNWIND_STRICT (flush, frame, 0, 0); } return 0; -- cgit