diff options
author | Brian Foster <bfoster@redhat.com> | 2012-07-19 15:01:13 -0400 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2012-07-30 11:39:03 -0700 |
commit | 879c01087d58742515524664e8a193a04a0d4146 (patch) | |
tree | 9c824d09ea4a814e085f9df6b2ff292a767c6868 | |
parent | b3025cb8dd79751c12b7bb375ec701b532688cbc (diff) |
cluster/stripe: handle short writes and errors in writev callback
cluster/stripe write callback handling is broken in the event of
server side errors and short writes due to crudely summing up the
return values from each node. This can produce incorrect results
or cause an application to rewrite the wrong portions of a buffer
in an attempt to handle this condition.
Modify cluster/stripe writev handling to record the requested size
of each write and use this data to return the number of consecutive
bytes written from the original request. This allows an application
to retry a write at the point of error (and potentially consume
said error).
BUG: 809975
Change-Id: Ic35cb1e092c29545205aa32e352485c507534ce0
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-on: http://review.gluster.com/3700
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Shishir Gowda <sgowda@redhat.com>
Reviewed-by: Amar Tumballi <amarts@redhat.com>
Reviewed-by: Anand Avati <avati@redhat.com>
-rw-r--r-- | xlators/cluster/stripe/src/stripe-mem-types.h | 2 | ||||
-rw-r--r-- | xlators/cluster/stripe/src/stripe.c | 136 | ||||
-rw-r--r-- | xlators/cluster/stripe/src/stripe.h | 6 |
3 files changed, 106 insertions, 38 deletions
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h index c8781d7d761..e05ba0c29c3 100644 --- a/xlators/cluster/stripe/src/stripe-mem-types.h +++ b/xlators/cluster/stripe/src/stripe-mem-types.h @@ -16,7 +16,7 @@ enum gf_stripe_mem_types_ { gf_stripe_mt_iovec = gf_common_mt_end + 1, - gf_stripe_mt_readv_replies, + gf_stripe_mt_stripe_replies, gf_stripe_mt_stripe_fd_ctx_t, gf_stripe_mt_char, gf_stripe_mt_int8_t, diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index e2176eeae66..6588a44996c 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -3482,8 +3482,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, frame->local = local; /* This is where all the vectors should be copied. */ - local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies), - gf_stripe_mt_readv_replies); + local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies), + gf_stripe_mt_stripe_replies); if (!local->replies) { op_errno = ENOMEM; goto err; @@ -3543,7 +3543,11 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + struct stripe_replies *reply = NULL; + int32_t i = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3552,48 +3556,75 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = ++local->call_count; + callcnt = ++mlocal->call_count; + + mlocal->replies[local->node_index].op_ret = op_ret; + mlocal->replies[local->node_index].op_errno = op_errno; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } if (op_ret >= 0) { - local->op_ret += op_ret; - local->post_buf = *postbuf; - local->pre_buf = *prebuf; + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; - local->prebuf_blocks += prebuf->ia_blocks; - local->postbuf_blocks += postbuf->ia_blocks; + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; - correct_file_size(prebuf, local->fctx, prev); - correct_file_size(postbuf, local->fctx, prev); + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); - if (local->prebuf_size < prebuf->ia_size) - local->prebuf_size = prebuf->ia_size; - if (local->postbuf_size < postbuf->ia_size) - local->postbuf_size = postbuf->ia_size; + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); - if ((callcnt == local->wind_count) && local->unwind) { - local->pre_buf.ia_size = local->prebuf_size; - local->pre_buf.ia_blocks = local->prebuf_blocks; - local->post_buf.ia_size = local->postbuf_size; - local->post_buf.ia_blocks = local->postbuf_blocks; + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + /* + * Only return the number of consecutively written bytes up until + * the first error. Only return an error if it occurs first. + * + * When a short write occurs, the application should retry at the + * appropriate offset, at which point we'll potentially pass back + * the error. + */ + for (i = 0, reply = mlocal->replies; i < mlocal->wind_count; + i++, reply++) { + if (reply->op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "reply %d " + "returned error %s", i, + strerror(reply->op_errno)); + if (!mlocal->op_ret) { + mlocal->op_ret = -1; + mlocal->op_errno = reply->op_errno; + } + break; + } - STRIPE_STACK_UNWIND (writev, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf, NULL); + mlocal->op_ret += reply->op_ret; + + if (reply->op_ret < reply->requested_size) + break; + } + + GF_FREE(mlocal->replies); + + STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); } out: + STRIPE_STACK_DESTROY(frame); return 0; } @@ -3615,6 +3646,11 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; off_t dest_offset = 0; + off_t rounded_start = 0; + off_t rounded_end = 0; + int32_t total_chunks = 0; + call_frame_t *wframe = NULL; + stripe_local_t *wlocal = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3653,7 +3689,27 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } + rounded_start = floor(offset, stripe_size); + rounded_end = roof(offset + total_size, stripe_size); + total_chunks = (rounded_end - rounded_start) / stripe_size; + local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies), + gf_stripe_mt_stripe_replies); + if (!local->replies) { + op_errno = ENOMEM; + goto err; + } + + total_chunks = 0; while (1) { + wframe = copy_frame(frame); + wlocal = mem_get0(this->local_pool); + if (!wlocal) { + op_errno = ENOMEM; + goto err; + } + wlocal->orig_frame = frame; + wframe->local = wlocal; + /* Send striped chunk of the vector to child nodes appropriately. */ idx = (((offset + offset_offset) / @@ -3681,25 +3737,37 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (remaining_size == 0) local->unwind = 1; + /* + * Store off the request index (with respect to the chunk of the + * initial offset) and the size of the request. This is required + * in the callback to calculate an appropriate return value in + * the event of a write failure in one or more requests. + */ + wlocal->node_index = total_chunks; + local->replies[total_chunks].requested_size = fill_size; + + dest_offset = offset + offset_offset; if (fctx->stripe_coalesce) - dest_offset = coalesced_offset(offset + offset_offset, - local->stripe_size, fctx->stripe_count); - else - dest_offset = offset + offset_offset; + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); - STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], + STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->writev, fd, tmp_vec, tmp_count, dest_offset, flags, iobref, xdata); GF_FREE (tmp_vec); offset_offset += fill_size; + total_chunks++; if (remaining_size == 0) break; } return 0; err: + if (wframe) + STRIPE_STACK_DESTROY(wframe); + STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index 1b9e660c126..a440f87ba1f 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -106,9 +106,9 @@ struct stripe_private { }; /** - * Used to keep info about the replies received from fops->readv calls + * Used to keep info about the replies received from readv/writev calls */ -struct readv_replies { +struct stripe_replies { struct iovec *vector; int32_t count; //count of vector int32_t op_ret; //op_ret of readv @@ -156,7 +156,7 @@ struct stripe_local { blkcnt_t preparent_blocks; blkcnt_t postparent_blocks; - struct readv_replies *replies; + struct stripe_replies *replies; struct statvfs statvfs_buf; dir_entry_t *entry; |