diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2014-10-15 09:44:55 +0200 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-10-27 07:18:51 -0700 |
commit | 408d4454870e7374bc9c4060c6c23d224d5174a2 (patch) | |
tree | 01a0e65e9e825dadb23e8a14c4838b2e93eb7519 /xlators/cluster/ec | |
parent | 8ea1a4844975940013d8704f87ee137dcb27bfb5 (diff) |
ec: Fix rebalance issues
Some issues in ec xlator made that rebalance didn't complete
successfully and generated some warnings and errors in the
log. The most critical error was a race condition that caused
false corruption detection when two specific operations were
executed sequentially and they shared the same lock.
This explains the problem:
1. A setxattr is issued.
2. setxattr: ec locks the inode before updating the xattr.
3. setxattr: The xattr is updated.
4. setxattr: Upper xlator is notified that the operation completed.
5. setxattr: A background task is initiated to update the version
of the file.
6. A stat is issued on the same file.
7. stat: Since the lock is already acquired, it's reused.
8. stat: A lookup is issued to determine version and size
information of the file.
At this point, operations 5 and 8 can interfere. This can make that
lookup sees different information on each brick, determining that
some bricks are corrupted and incorrectly excluding them from the
operation and initiating a self-heal. In some cases this false
detection combined with self-heal could lead to invalid updates of
the trusted.ec.size xattr, leaving the file smaller than it should
be.
This only happens if the first operation does not perform a lookup,
because chained operations reuse the information returned by the
previous one, avoiding this kind of problems.
To solve this, now the background update is executed atomically with
the posterior unlock. This avoids some reuses of the lock while
updating. However this reduces performance because the window in
which new requests can reuse the lock is much smaller now. This has
been alleviated by using the same technique implemented in AFR (i.e.
waiting some time before releasing the lock).
Some minor changes also introduced in this patch:
* Bug in management of 'trusted.glusterfs.pathinfo' that was writing
beyond the allocated space.
* Uninitialized variable.
* trusted.ec.config was not created for regular files created with
mknod.
* An invalid state was used in access fop.
This is a backport of http://review.gluster.org/8947/
Change-Id: Idfaf69578ed04dbac97a62710326729715b9b395
BUG: 1152903
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/8948
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Diffstat (limited to 'xlators/cluster/ec')
-rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 6 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 292 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 30 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 2 |
5 files changed, 218 insertions, 113 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 0fa5ac068de..93593fb8b6a 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -326,8 +326,10 @@ int32_t ec_dict_data_concat(const char * fmt, ec_cbk_data_t * cbk, len = prelen; for (i = 0; i < num; i++) { - memcpy(str + len, sep, seplen); - len += seplen; + if (i > 0) { + memcpy(str + len, sep, seplen); + len += seplen; + } tmp = data[i]->len - 1; memcpy(str + len, data[i]->data, tmp); len += tmp; diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 561871cee93..2ba17305411 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -631,9 +631,11 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) return uuid_compare(lock1->loc.gfid, lock2->loc.gfid); } -void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update) +ec_lock_link_t *ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, + int32_t update) { ec_lock_t * tmp; + ec_lock_link_t *link = NULL; int32_t tmp_update; if ((fop->lock_count > 0) && @@ -654,13 +656,23 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update) fop->lock_count++; - lock->refs++; + if (lock->timer != NULL) { + link = lock->timer->data; + ec_trace("UNLOCK_CANCELLED", link->fop, "lock=%p", lock); + gf_timer_call_cancel(fop->xl->ctx, lock->timer); + lock->timer = NULL; + } else { + lock->refs++; + } + + return link; } void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) { ec_lock_t * lock = NULL; ec_inode_t * ctx = NULL; + ec_lock_link_t *link = NULL; loc_t tmp; int32_t error; @@ -724,16 +736,21 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) ctx->entry_lock = lock; insert: - ec_lock_insert(fop, lock, update); + link = ec_lock_insert(fop, lock, update); unlock: UNLOCK(&tmp.inode->lock); loc_wipe(&tmp); + + if (link != NULL) { + ec_resume(link->fop, 0); + } } void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) { + ec_lock_link_t *link = NULL; ec_lock_t * lock; ec_inode_t * ctx; @@ -778,10 +795,14 @@ void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) ctx->inode_lock = lock; insert: - ec_lock_insert(fop, lock, update); + link = ec_lock_insert(fop, lock, update); unlock: UNLOCK(&loc->inode->lock); + + if (link != NULL) { + ec_resume(link->fop, 0); + } } void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) @@ -898,90 +919,6 @@ void ec_lock(ec_fop_data_t * fop) } } -int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this, - int32_t op_ret, int32_t op_errno, dict_t * xdata) -{ - ec_fop_data_t * fop = cookie; - - if (op_ret < 0) - { - gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", - ec_fop_name(fop->parent->id)); - } - else - { - ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); - } - - return 0; -} - -void ec_unlock(ec_fop_data_t * fop) -{ - ec_lock_t * lock; - int32_t i, refs; - - for (i = 0; i < fop->lock_count; i++) - { - lock = fop->locks[i].lock; - - LOCK(&lock->loc.inode->lock); - - ec_trace("UNLOCK", fop, "lock=%p", lock); - - refs = --lock->refs; - if (refs == 0) - { - *lock->plock = NULL; - } - - UNLOCK(&lock->loc.inode->lock); - - if (refs == 0) - { - if (lock->mask != 0) - { - ec_owner_set(fop->frame, lock); - - switch (lock->kind) - { - case EC_LOCK_ENTRY: - ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, " - "path=%s", - lock, lock->loc.inode, lock->loc.path); - - ec_entrylk(fop->frame, fop->xl, lock->mask, - EC_MINIMUM_ALL, ec_unlocked, lock, - fop->xl->name, &lock->loc, NULL, - ENTRYLK_UNLOCK, lock->type, NULL); - - break; - - case EC_LOCK_INODE: - lock->flock.l_type = F_UNLCK; - ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", - lock, lock->loc.inode); - - ec_inodelk(fop->frame, fop->xl, lock->mask, - EC_MINIMUM_ALL, ec_unlocked, lock, - fop->xl->name, &lock->loc, F_SETLK, - &lock->flock, NULL); - - break; - - default: - gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock " - "type"); - } - } - - ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); - - ec_lock_destroy(lock); - } - } -} - int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, inode_t * inode, @@ -991,7 +928,7 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, ec_t * ec; ec_fop_data_t * fop = cookie; ec_inode_t * ctx; - ec_lock_t * lock; + ec_lock_t *lock = NULL; if (op_ret >= 0) { @@ -1192,6 +1129,58 @@ out: ec_fop_set_error(fop, error); } +int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "entry/inode unlocking failed (%s)", + ec_fop_name(fop->parent->id)); + } else { + ec_trace("UNLOCKED", fop->parent, "lock=%p", fop->data); + } + + return 0; +} + +void ec_unlock_lock(ec_fop_data_t *fop, ec_lock_t *lock) +{ + if (lock->mask != 0) { + ec_owner_set(fop->frame, lock); + + switch (lock->kind) { + case EC_LOCK_ENTRY: + ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s", lock, + lock->loc.inode, lock->loc.path); + + ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, + ec_unlocked, lock, fop->xl->name, &lock->loc, NULL, + ENTRYLK_UNLOCK, lock->type, NULL); + + break; + + case EC_LOCK_INODE: + lock->flock.l_type = F_UNLCK; + ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, + lock->loc.inode); + + ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, + ec_unlocked, lock, fop->xl->name, &lock->loc, F_SETLK, + &lock->flock, NULL); + + break; + + default: + gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type"); + } + } + + ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); + + ec_lock_destroy(lock); +} + int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, dict_t * xattr, @@ -1209,11 +1198,15 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, fop->parent->mask &= fop->good; } + if (fop->data != NULL) { + ec_unlock_lock(fop->parent, fop->data); + } + return 0; } void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version, - uint64_t size) + uint64_t size, ec_lock_t *lock) { dict_t * dict; uid_t uid; @@ -1253,7 +1246,7 @@ void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version, fop->frame->root->gid = 0; ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, - ec_update_size_version_done, NULL, loc, + ec_update_size_version_done, lock, loc, GF_XATTROP_ADD_ARRAY64, dict, NULL); fop->frame->root->uid = uid; @@ -1274,6 +1267,103 @@ out: gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size"); } +void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock) +{ + ec_trace("UNLOCK_NOW", fop, "lock=%p", lock); + + if (lock->version_delta != 0) { + ec_update_size_version(fop, &lock->loc, lock->version_delta, + lock->size_delta, lock); + } else { + ec_unlock_lock(fop, lock); + } + + ec_resume(fop, 0); +} + +void ec_unlock_timer_cbk(void *data) +{ + ec_lock_link_t *link = data; + ec_lock_t *lock = link->lock; + ec_fop_data_t *fop = NULL; + + LOCK(&lock->loc.inode->lock); + + if (lock->timer != NULL) { + fop = link->fop; + + ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); + + GF_ASSERT(lock->refs == 1); + + gf_timer_call_cancel(fop->xl->ctx, lock->timer); + lock->timer = NULL; + *lock->plock = NULL; + } + + UNLOCK(&lock->loc.inode->lock); + + if (fop != NULL) { + ec_unlock_now(fop, lock); + } +} + +void ec_unlock_timer_add(ec_lock_link_t *link) +{ + struct timespec delay; + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; + int32_t refs = 1; + + LOCK(&lock->loc.inode->lock); + + GF_ASSERT(lock->timer == NULL); + + if (lock->refs != 1) { + ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); + + lock->refs--; + + UNLOCK(&lock->loc.inode->lock); + } else { + ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); + + delay.tv_sec = 1; + delay.tv_nsec = 0; + + LOCK(&fop->lock); + + fop->jobs++; + fop->refs++; + + UNLOCK(&fop->lock); + + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, + ec_unlock_timer_cbk, link); + if (lock->timer == NULL) { + gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an unlock"); + + *lock->plock = NULL; + refs = 0; + } + + UNLOCK(&lock->loc.inode->lock); + + if (refs == 0) { + ec_unlock_now(fop, lock); + } + } +} + +void ec_unlock(ec_fop_data_t *fop) +{ + int32_t i; + + for (i = 0; i < fop->lock_count; i++) { + ec_unlock_timer_add(&fop->locks[i]); + } +} + void ec_flush_size_version(ec_fop_data_t * fop) { ec_lock_t * lock; @@ -1296,7 +1386,7 @@ void ec_flush_size_version(ec_fop_data_t * fop) if (version > 0) { - ec_update_size_version(fop, &lock->loc, version, delta); + ec_update_size_version(fop, &lock->loc, version, delta, NULL); } } @@ -1305,16 +1395,10 @@ void ec_lock_reuse(ec_fop_data_t *fop) ec_fop_data_t * wait_fop; ec_lock_t * lock; ec_lock_link_t * link; - uint64_t version = 0, delta = 0; - int32_t refs = 0; int32_t i; for (i = 0; i < fop->lock_count; i++) { - refs = 0; - delta = 0; - version = 0; - wait_fop = NULL; lock = fop->locks[i].lock; @@ -1338,14 +1422,6 @@ void ec_lock_reuse(ec_fop_data_t *fop) } } - version = lock->version_delta; - delta = lock->size_delta; - refs = lock->refs; - if (refs == 1) { - lock->version_delta = 0; - lock->size_delta = 0; - } - lock->good_mask &= fop->mask; if (!list_empty(&lock->waiting)) @@ -1371,10 +1447,6 @@ void ec_lock_reuse(ec_fop_data_t *fop) ec_resume(wait_fop, 0); } - - if ((refs == 1) && (version > 0)) { - ec_update_size_version(fop, &lock->loc, version, delta); - } } } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index ac197fe7f0b..97ec58469ad 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -142,6 +142,7 @@ union _ec_cbk struct _ec_lock { ec_lock_t **plock; + gf_timer_t *timer; struct list_head waiting; uintptr_t mask; uintptr_t good_mask; diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index 02961acb8bd..0bd10d4e27d 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -1016,11 +1016,41 @@ void ec_wind_mknod(ec_t * ec, ec_fop_data_t * fop, int32_t idx) int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) { + ec_t *ec; ec_cbk_data_t * cbk; switch (state) { case EC_STATE_INIT: + if (S_ISREG(fop->mode[0])) { + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + } + + ec = fop->xl->private; + + fop->config.version = EC_CONFIG_VERSION; + fop->config.algorithm = EC_CONFIG_ALGORITHM; + fop->config.gf_word_size = EC_GF_BITS; + fop->config.bricks = ec->nodes; + fop->config.redundancy = ec->redundancy; + fop->config.chunk_size = EC_METHOD_CHUNK_SIZE; + + if (ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, + &fop->config) < 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + } + + /* Fall through */ + case EC_STATE_LOCK: ec_lock_prepare_entry(fop, &fop->loc[0], 1); ec_lock(fop); diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 78a189bc325..c31f0d97674 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -81,7 +81,7 @@ int32_t ec_manager_access(ec_fop_data_t * fop, int32_t state) case EC_STATE_DISPATCH: ec_dispatch_one(fop); - return EC_STATE_PREPARE_ANSWER; + return EC_STATE_REPORT; case -EC_STATE_REPORT: if (fop->cbks.access != NULL) |