diff options
author | Brian Foster <bfoster@redhat.com> | 2012-07-19 15:01:13 -0400 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2012-07-30 11:39:03 -0700 |
commit | 879c01087d58742515524664e8a193a04a0d4146 (patch) | |
tree | 9c824d09ea4a814e085f9df6b2ff292a767c6868 /xlators/cluster/stripe/src/stripe.c | |
parent | b3025cb8dd79751c12b7bb375ec701b532688cbc (diff) |
cluster/stripe: handle short writes and errors in writev callback
cluster/stripe write callback handling is broken in the event of
server side errors and short writes due to crudely summing up the
return values from each node. This can produce incorrect results
or cause an application to rewrite the wrong portions of a buffer
in an attempt to handle this condition.
Modify cluster/stripe writev handling to record the requested size
of each write and use this data to return the number of consecutive
bytes written from the original request. This allows an application
to retry a write at the point of error (and potentially consume
said error).
BUG: 809975
Change-Id: Ic35cb1e092c29545205aa32e352485c507534ce0
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-on: http://review.gluster.com/3700
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Shishir Gowda <sgowda@redhat.com>
Reviewed-by: Amar Tumballi <amarts@redhat.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/cluster/stripe/src/stripe.c')
-rw-r--r-- | xlators/cluster/stripe/src/stripe.c | 136 |
1 files changed, 102 insertions, 34 deletions
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index e2176eeae66..6588a44996c 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -3482,8 +3482,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, frame->local = local; /* This is where all the vectors should be copied. */ - local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies), - gf_stripe_mt_readv_replies); + local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies), + gf_stripe_mt_stripe_replies); if (!local->replies) { op_errno = ENOMEM; goto err; @@ -3543,7 +3543,11 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { int32_t callcnt = 0; stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + struct stripe_replies *reply = NULL; + int32_t i = 0; if (!this || !frame || !frame->local || !cookie) { gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); @@ -3552,48 +3556,75 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; LOCK(&frame->lock); { - callcnt = ++local->call_count; + callcnt = ++mlocal->call_count; + + mlocal->replies[local->node_index].op_ret = op_ret; + mlocal->replies[local->node_index].op_errno = op_errno; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s returned error %s", - prev->this->name, strerror (op_errno)); - local->op_errno = op_errno; - local->op_ret = -1; - } if (op_ret >= 0) { - local->op_ret += op_ret; - local->post_buf = *postbuf; - local->pre_buf = *prebuf; + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; - local->prebuf_blocks += prebuf->ia_blocks; - local->postbuf_blocks += postbuf->ia_blocks; + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; - correct_file_size(prebuf, local->fctx, prev); - correct_file_size(postbuf, local->fctx, prev); + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); - if (local->prebuf_size < prebuf->ia_size) - local->prebuf_size = prebuf->ia_size; - if (local->postbuf_size < postbuf->ia_size) - local->postbuf_size = postbuf->ia_size; + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; } } UNLOCK (&frame->lock); - if ((callcnt == local->wind_count) && local->unwind) { - local->pre_buf.ia_size = local->prebuf_size; - local->pre_buf.ia_blocks = local->prebuf_blocks; - local->post_buf.ia_size = local->postbuf_size; - local->post_buf.ia_blocks = local->postbuf_blocks; + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + /* + * Only return the number of consecutively written bytes up until + * the first error. Only return an error if it occurs first. + * + * When a short write occurs, the application should retry at the + * appropriate offset, at which point we'll potentially pass back + * the error. + */ + for (i = 0, reply = mlocal->replies; i < mlocal->wind_count; + i++, reply++) { + if (reply->op_ret == -1) { + gf_log(this->name, GF_LOG_DEBUG, "reply %d " + "returned error %s", i, + strerror(reply->op_errno)); + if (!mlocal->op_ret) { + mlocal->op_ret = -1; + mlocal->op_errno = reply->op_errno; + } + break; + } - STRIPE_STACK_UNWIND (writev, frame, local->op_ret, - local->op_errno, &local->pre_buf, - &local->post_buf, NULL); + mlocal->op_ret += reply->op_ret; + + if (reply->op_ret < reply->requested_size) + break; + } + + GF_FREE(mlocal->replies); + + STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); } out: + STRIPE_STACK_DESTROY(frame); return 0; } @@ -3615,6 +3646,11 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, uint64_t stripe_size = 0; uint64_t tmp_fctx = 0; off_t dest_offset = 0; + off_t rounded_start = 0; + off_t rounded_end = 0; + int32_t total_chunks = 0; + call_frame_t *wframe = NULL; + stripe_local_t *wlocal = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3653,7 +3689,27 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } + rounded_start = floor(offset, stripe_size); + rounded_end = roof(offset + total_size, stripe_size); + total_chunks = (rounded_end - rounded_start) / stripe_size; + local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies), + gf_stripe_mt_stripe_replies); + if (!local->replies) { + op_errno = ENOMEM; + goto err; + } + + total_chunks = 0; while (1) { + wframe = copy_frame(frame); + wlocal = mem_get0(this->local_pool); + if (!wlocal) { + op_errno = ENOMEM; + goto err; + } + wlocal->orig_frame = frame; + wframe->local = wlocal; + /* Send striped chunk of the vector to child nodes appropriately. */ idx = (((offset + offset_offset) / @@ -3681,25 +3737,37 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (remaining_size == 0) local->unwind = 1; + /* + * Store off the request index (with respect to the chunk of the + * initial offset) and the size of the request. This is required + * in the callback to calculate an appropriate return value in + * the event of a write failure in one or more requests. + */ + wlocal->node_index = total_chunks; + local->replies[total_chunks].requested_size = fill_size; + + dest_offset = offset + offset_offset; if (fctx->stripe_coalesce) - dest_offset = coalesced_offset(offset + offset_offset, - local->stripe_size, fctx->stripe_count); - else - dest_offset = offset + offset_offset; + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); - STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx], + STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx], fctx->xl_array[idx]->fops->writev, fd, tmp_vec, tmp_count, dest_offset, flags, iobref, xdata); GF_FREE (tmp_vec); offset_offset += fill_size; + total_chunks++; if (remaining_size == 0) break; } return 0; err: + if (wframe) + STRIPE_STACK_DESTROY(wframe); + STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } |