diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2014-11-08 21:46:41 +0100 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-01-28 19:49:29 -0800 |
commit | b17122ffc75c65bda2cf3b3d99832bbf2718e8d3 (patch) | |
tree | 258ad51bcf7d08bccc96cd94b7440a0dce227f33 /xlators/cluster/ec/src | |
parent | 88136b53f59e3b81aacc28df18bda575da35b02d (diff) |
ec: Fix posix compliance failures
This patch solves some problems that caused dispersed volumes to not
pass posix smoke tests:
* Problems in open/create with O_WRONLY
Opening files with -w- permissions using O_WRONLY returned an EACCES
error because internally O_WRONLY was replaced with O_RDWR.
* Problems with entrylk on renames.
When source and destination names were in the same directory, ec
tried to acquire the same entrylk twice, causing a deadlock.
* Overwrite of a variable when reordering locks.
On a rename, if the second lock needed to be placed at the beginning
of the list, the 'lock' variable was overwritten, so the timer that
was cancelled later belonged to the wrong lock.
* Handle O_TRUNC in open.
When O_TRUNC was received in an open call, it was blindly propagated
to child subvolumes. This caused a discrepancy between the real file
size and the size stored in the trusted.ec.size xattr. This has been
solved by removing O_TRUNC from open and calling ftruncate afterwards.
Change-Id: I20c3d6e1c11be314be86879be54b728e01013798
BUG: 1161886
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/9420
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 54 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 8 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 13 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 50 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 62 |
6 files changed, 101 insertions, 87 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 89c78c69bae..fcae083aa84 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -232,6 +232,16 @@ void ec_fop_set_error(ec_fop_data_t * fop, int32_t error) UNLOCK(&fop->lock); } +void ec_sleep(ec_fop_data_t *fop) +{ + LOCK(&fop->lock); + + fop->refs++; + fop->jobs++; + + UNLOCK(&fop->lock); +} + int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume) { int32_t error = -1; @@ -435,12 +445,7 @@ int32_t ec_child_select(ec_fop_data_t * fop) return 0; } - LOCK(&fop->lock); - - fop->jobs++; - fop->refs++; - - UNLOCK(&fop->lock); + ec_sleep(fop); return 1; } @@ -637,22 +642,23 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update) { - ec_lock_t * tmp; + ec_lock_t *new_lock, *tmp; ec_lock_link_t *link = NULL; int32_t tmp_update; + new_lock = lock; if ((fop->lock_count > 0) && - (ec_lock_compare(fop->locks[0].lock, lock) > 0)) + (ec_lock_compare(fop->locks[0].lock, new_lock) > 0)) { tmp = fop->locks[0].lock; - fop->locks[0].lock = lock; - lock = tmp; + fop->locks[0].lock = new_lock; + new_lock = tmp; tmp_update = fop->locks_update; fop->locks_update = update; update = tmp_update; } - fop->locks[fop->lock_count].lock = lock; + fop->locks[fop->lock_count].lock = new_lock; fop->locks[fop->lock_count].fop = fop; fop->locks_update |= update << fop->lock_count; @@ -693,6 +699,16 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) return; } + + /* If there's another lock, make sure that it's not the same. Otherwise + * do not insert it. + * + * This can only happen on renames where source and target names are + * in the same directory. 
*/ + if ((fop->lock_count > 0) && + (fop->locks[0].lock->loc.inode == tmp.inode)) { + goto wipe; + } } else { if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) { ec_fop_set_error(fop, EIO); @@ -742,6 +758,7 @@ insert: unlock: UNLOCK(&tmp.inode->lock); +wipe: loc_wipe(&tmp); if (link != NULL) { @@ -870,12 +887,7 @@ void ec_lock(ec_fop_data_t * fop) list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting); - LOCK(&fop->lock); - - fop->jobs++; - fop->refs++; - - UNLOCK(&fop->lock); + ec_sleep(fop); UNLOCK(&lock->loc.inode->lock); @@ -1332,12 +1344,7 @@ void ec_unlock_timer_add(ec_lock_link_t *link) delay.tv_sec = 1; delay.tv_nsec = 0; - LOCK(&fop->lock); - - fop->jobs++; - fop->refs++; - - UNLOCK(&fop->lock); + ec_sleep(fop); /* If healing is needed, do not delay lock release to let self-heal * start working as soon as possible. */ @@ -1356,6 +1363,7 @@ void ec_unlock_timer_add(ec_lock_link_t *link) refs = 0; } } else { + ec_trace("UNLOCK_FORCE", fop, "lock=%p", lock); *lock->plock = NULL; refs = 0; } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index a0f5c0bc290..2b1d9574cdf 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -93,6 +93,7 @@ void ec_dispatch_one(ec_fop_data_t * fop); void ec_wait_winds(ec_fop_data_t * fop); +void ec_sleep(ec_fop_data_t *fop); void ec_resume(ec_fop_data_t * fop, int32_t error); void ec_resume_parent(ec_fop_data_t * fop, int32_t error); diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index e8d96272987..d48be20470d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -158,10 +158,7 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) return EC_STATE_REPORT; } - if (ctx->flags == 0) - { - ctx->flags = fop->int32; - } + ctx->flags = fop->int32; UNLOCK(&fop->fd->lock); @@ -207,8 +204,7 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) /* We 
need to write to specific offsets on the bricks, so we * need to remove O_APPEND from flags (if present) */ - fop->int32 &= ~(O_ACCMODE | O_APPEND); - fop->int32 |= O_RDWR; + fop->int32 &= ~O_APPEND; /* Fall through */ diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index d37a657de02..c49ccf3fd72 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -783,7 +783,6 @@ ec_cbk_data_t * ec_heal_lookup_check(ec_heal_t * heal, uintptr_t * pgood, void ec_heal_prepare(ec_heal_t * heal) { ec_cbk_data_t * cbk; - ec_fd_t * ctx; int32_t error = ENOMEM; heal->available = heal->good; @@ -814,13 +813,6 @@ void ec_heal_prepare(ec_heal_t * heal) goto out; } - ctx = ec_fd_get(heal->fd, heal->xl); - if ((ctx == NULL) || (loc_copy(&ctx->loc, &heal->loc) != 0)) - { - goto out; - } - - ctx->flags = O_RDWR; } if (heal->iatt.ia_type == IA_IFLNK) @@ -1057,11 +1049,6 @@ void ec_heal_reopen_fd(ec_heal_t * heal) else { flags = ctx_fd->flags & ~(O_TRUNC | O_APPEND); - if ((flags & O_ACCMODE) == O_WRONLY) - { - flags &= ~O_ACCMODE; - flags |= O_RDWR; - } ec_open(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE, ec_heal_reopen_cbk, NULL, &heal->loc, flags, fd, diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 9d860161ecf..3483dfb3354 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -697,6 +697,26 @@ void ec_wind_open(ec_t * ec, ec_fop_data_t * fop, int32_t idx) &fop->loc[0], fop->int32, fop->fd, fop->xdata); } +int32_t ec_open_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + int32_t error = 0; + + fop = fop->data; + if (op_ret >= 0) { + fop->answer->iatt[0] = *postbuf; + } else { + error = op_errno; + } + + ec_resume(fop, error); + + return 0; +} + int32_t ec_manager_open(ec_fop_data_t 
* fop, int32_t state) { ec_cbk_data_t * cbk; @@ -717,21 +737,18 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state) return EC_STATE_REPORT; } - if (ctx->flags == 0) - { - ctx->flags = fop->int32; - } + ctx->flags = fop->int32; UNLOCK(&fop->fd->lock); - if ((fop->int32 & O_ACCMODE) == O_WRONLY) - { - fop->int32 &= ~O_ACCMODE; - fop->int32 |= O_RDWR; - } /* We need to write to specific offsets on the bricks, so we - * need to remove O_APPEND from flags (if present) */ - fop->int32 &= ~O_APPEND; + need to remove O_APPEND from flags (if present). + If O_TRUNC is specified, we remove it from open and an + ftruncate will be executed later, which will correctly update + the file size taking appropriate locks. O_TRUNC flag is saved + into fop->uint32 to use it later.*/ + fop->uint32 = fop->int32 & O_TRUNC; + fop->int32 &= ~(O_APPEND | O_TRUNC); /* Fall through */ @@ -766,6 +783,17 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state) } UNLOCK(&fop->fd->lock); + + /* If O_TRUNC was specified, call ftruncate to + effectively trunc the file with appropriate locks + acquired. We don't use ctx->flags because self-heal + can use the same fd with different flags. 
*/ + if (fop->uint32 != 0) { + ec_sleep(fop); + ec_ftruncate(fop->req_frame, fop->xl, cbk->mask, + fop->minimum, ec_open_truncate_cbk, + fop, cbk->fd, 0, NULL); + } } } if (cbk->op_ret < 0) { diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index a48ea09926a..140d59f5f20 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -1248,8 +1248,6 @@ int32_t ec_truncate_open_cbk(call_frame_t * frame, void * cookie, int32_t ec_truncate_clean(ec_fop_data_t * fop) { - ec_fd_t * ctx; - if (fop->fd == NULL) { fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid); @@ -1257,13 +1255,6 @@ int32_t ec_truncate_clean(ec_fop_data_t * fop) { return 0; } - ctx = ec_fd_get(fop->fd, fop->xl); - if ((ctx == NULL) || (loc_copy(&ctx->loc, &fop->loc[0]) != 0)) - { - return 0; - } - - ctx->flags = O_RDWR; ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum, ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, @@ -1701,20 +1692,6 @@ out: /* FOP: writev */ -int32_t ec_writev_init(ec_fop_data_t * fop) -{ - ec_fd_t * ctx; - - ctx = ec_fd_get(fop->fd, fop->xl); - if (ctx != NULL) { - if ((ctx->flags & O_ACCMODE) == O_RDONLY) { - return EBADF; - } - } - - return 0; -} - int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, struct iovec * vector, int32_t count, @@ -1787,14 +1764,29 @@ int32_t ec_writev_merge_head(call_frame_t * frame, void * cookie, return 0; } -void ec_writev_start(ec_fop_data_t * fop) +void ec_writev_start(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; struct iobref *iobref = NULL; struct iobuf *iobuf = NULL; void *ptr = NULL; ec_fd_t *ctx; + fd_t *fd; size_t tail; + uid_t uid; + gid_t gid; + + fd = fd_anonymous(fop->fd->inode); + if (fd == NULL) { + ec_fop_set_error(fop, EIO); + + return; + } + + uid = fop->frame->root->uid; + fop->frame->root->uid = 0; + gid = fop->frame->root->gid; + 
fop->frame->root->gid = 0; ctx = ec_fd_get(fop->fd, fop->xl); if (ctx != NULL) { @@ -1833,7 +1825,7 @@ void ec_writev_start(ec_fop_data_t * fop) if (fop->head > 0) { ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, - NULL, fop->fd, ec->stripe_size, fop->offset, 0, NULL); + NULL, fd, ec->stripe_size, fop->offset, 0, NULL); } tail = fop->size - fop->user_size - fop->head; if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) @@ -1841,7 +1833,7 @@ void ec_writev_start(ec_fop_data_t * fop) if (fop->pre_size > fop->offset + fop->head + fop->user_size) { ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, - ec_writev_merge_tail, NULL, fop->fd, ec->stripe_size, + ec_writev_merge_tail, NULL, fd, ec->stripe_size, fop->offset + fop->size - ec->stripe_size, 0, NULL); } else @@ -1850,6 +1842,11 @@ void ec_writev_start(ec_fop_data_t * fop) } } + fop->frame->root->uid = uid; + fop->frame->root->gid = gid; + + fd_unref(fd); + return; out: @@ -1860,6 +1857,11 @@ out: iobref_unref(iobref); } + fop->frame->root->uid = uid; + fop->frame->root->gid = gid; + + fd_unref(fd); + ec_fop_set_error(fop, EIO); } @@ -2007,14 +2009,6 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state) switch (state) { case EC_STATE_INIT: - fop->error = ec_writev_init(fop); - if (fop->error != 0) - { - return EC_STATE_REPORT; - } - - /* Fall through */ - case EC_STATE_LOCK: ec_lock_prepare_fd(fop, fop->fd, 1); ec_lock(fop); |