diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2014-10-08 09:20:11 +0200 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-10-21 11:38:45 -0700 |
commit | 56caf4349c8824fde70783fe404cc6f646dce149 (patch) | |
tree | 02bef10cd75c1422c3ed272d5ecd01edcb404033 /xlators/cluster/ec/src/ec-common.c | |
parent | d57ecca6322a451242f4a2b7b5978de7c8f9088e (diff) |
ec: Fix self-heal issues
Problem: Doing an 'ls' of a directory that has been modified while one
of the bricks was down, sometimes returns the old directory
contents.
Cause: Directories are not marked when they are modified as files are.
The ec xlator balances requests amongst available and healthy
bricks. Since there is no way to detect that a directory is
out of date in one of the bricks, it is used from time to time
to return the directory contents.
Solution: Basically the solution consists in use versioning information
also for directories, however some additional changes have
been necessary.
Changes:
* Use directory versioning:
This required to lock full directory instead of a single entry for
all requests that add or remove entries from it. This is needed to
allow atomic version update. This affects the following fops:
create, mkdir, mknod, link, symlink, rename, unlink, rmdir
Another side effect is that opendir requires to do a previous
lookup to get versioning information and discard out of date
bricks for subsequent readdir(p) calls.
* Restrict directory self-heal:
Till now, when one discrepancy was found in lookup, a self-heal
was automatically started. This caused the versioning information
of a bad directory to be healed instantly, making the original
problem to reapear again.
To solve this, when a missing directory is detected in one or more
bricks on lookup or opendir fops, only a partial self-heal is
performed on it. A partial self-heal basically creates the
directory but does not restore any additional information.
This avoids that an 'ls' could repair the directory and cause the
problem to happen again. With this change, output of 'ls' is
always consistent. However, since the directory has been created
in the brick, this allows any other operation on it (create new
files, for example) to succeed on all bricks and not add additional
work to the self-heal process.
To force a self-heal of a directory, any other operation must be
done on it. For example a getxattr.
With these changes, the correct healing procedure that would avoid
inconsistent directory browsing consists on a post-order traversal
of directoriesi being healed. This way, the directory contents will
be healed before healing the directory itslef.
* Additional changes to fix self-heal errors
- Don't use fop->fd to decide between fd/loc.
open, opendir and create have an fd, but the correct data is in
loc.
- Fix incorrect management of bad bricks per inode/fd.
- Fix incorrect selection of fop's target bricks when there are bad
bricks involved.
- Improved ec_loc_parent() to always return a parent loc as
complete as possible.
Change-Id: Iaf3df174d7857da57d4a87b4a8740a7048b366ad
BUG: 1149726
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/8916
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 271 |
1 files changed, 131 insertions, 140 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 0ba7bce7728..561871cee93 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -71,7 +71,7 @@ uintptr_t ec_fd_good(fd_t * fd, xlator_t * xl) uintptr_t bad = 0; ctx = ec_fd_get(fd, xl); - if ((ctx != NULL) && (ctx->loc.inode != NULL)) + if (ctx != NULL) { bad = ctx->bad; } @@ -110,7 +110,7 @@ uintptr_t ec_update_fd(ec_fop_data_t * fop, fd_t * fd, uintptr_t good, LOCK(&fd->lock); ctx = __ec_fd_get(fd, fop->xl); - if ((ctx != NULL) && (ctx->loc.inode != NULL)) + if (ctx != NULL) { ctx->bad &= ~good; bad |= ctx->bad; @@ -143,6 +143,15 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, void ec_check_status(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; + int32_t partial = 0; + + if (fop->answer->op_ret >= 0) { + if (fop->id == GF_FOP_LOOKUP) { + partial = fop->answer->iatt[0].ia_type == IA_IFDIR; + } else if (fop->id == GF_FOP_OPENDIR) { + partial = 1; + } + } if ((ec->xl_up & ~(fop->remaining | fop->good)) == 0) { @@ -154,42 +163,36 @@ void ec_check_status(ec_fop_data_t * fop) "remaining=%lX, good=%lX, bad=%lX)", ec->xl_up, fop->mask, fop->remaining, fop->good, fop->bad); - if (fop->fd != NULL) + if (fop->use_fd) { - ec_fheal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, - fop->fd, NULL); + if (fop->fd != NULL) { + ec_fheal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, + NULL, fop->fd, partial, NULL); + } } else { ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, - &fop->loc[0], NULL); + &fop->loc[0], partial, NULL); if (fop->loc[1].inode != NULL) { ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, - NULL, &fop->loc[1], NULL); + NULL, &fop->loc[1], partial, NULL); } } } void ec_update_bad(ec_fop_data_t * fop, uintptr_t good) { + ec_t *ec = fop->xl->private; uintptr_t bad; - int32_t update = 0; - bad = fop->mask & ~(fop->remaining | good); - if ((fop->bad & bad) != bad) - { - fop->bad |= bad; - update = 1; - } - if ((fop->good & good) != good) - { - fop->good |= good; - update = 1; - } + bad = ec->xl_up & ~(fop->remaining | good); + fop->bad |= bad; + fop->good |= good; - if (update && (fop->parent == NULL)) + if (fop->parent == NULL) { if ((fop->flags & EC_FLAG_UPDATE_LOC_PARENT) != 0) { @@ -369,21 +372,20 @@ int32_t ec_child_select(ec_fop_data_t * fop) fop->mask &= ec->node_mask; mask = ec->xl_up; - if (fop->loc[0].inode != NULL) - { - mask |= ec_inode_good(fop->loc[0].inode, fop->xl); - } - if (fop->loc[1].inode != NULL) - { - mask |= ec_inode_good(fop->loc[1].inode, fop->xl); - } - if (fop->fd != NULL) + if (fop->parent == NULL) { - if (fop->fd->inode != NULL) - { - mask |= ec_inode_good(fop->fd->inode, fop->xl); + if (fop->loc[0].inode != NULL) { + mask &= ec_inode_good(fop->loc[0].inode, fop->xl); + } + if (fop->loc[1].inode != NULL) { + mask &= ec_inode_good(fop->loc[1].inode, fop->xl); + } + if (fop->fd != NULL) { + if (fop->fd->inode != NULL) { + mask &= ec_inode_good(fop->fd->inode, fop->xl); + } + mask &= ec_fd_good(fop->fd, fop->xl); } - mask |= ec_fd_good(fop->fd, fop->xl); } if ((fop->mask & ~mask) != 0) { @@ -619,7 +621,6 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) void ec_lock_destroy(ec_lock_t * lock) { - GF_FREE(lock->basename); loc_wipe(&lock->loc); mem_put(lock); @@ -627,31 +628,13 @@ void ec_lock_destroy(ec_lock_t * lock) int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) { - int32_t res; - - res = uuid_compare(lock1->loc.gfid, lock2->loc.gfid); - if (res != 0) - { - return res; - } - if (lock1->basename == NULL) - { - if (lock2->basename == NULL) - { - return 0; - } - return 1; - } - if (lock2->basename == NULL) - { - return -1; - } - return strcmp(lock1->basename, lock2->basename); + return uuid_compare(lock1->loc.gfid, lock2->loc.gfid); } -void ec_lock_insert(ec_fop_data_t * fop, ec_lock_t * lock) +void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, int32_t update) { ec_lock_t * tmp; + int32_t tmp_update; if ((fop->lock_count > 0) && (ec_lock_compare(fop->locks[0].lock, lock) > 0)) @@ -659,19 +642,25 @@ void ec_lock_insert(ec_fop_data_t * fop, ec_lock_t * lock) tmp = fop->locks[0].lock; fop->locks[0].lock = lock; lock = tmp; + + tmp_update = fop->locks_update; + fop->locks_update = update; + update = tmp_update; } fop->locks[fop->lock_count].lock = lock; fop->locks[fop->lock_count].fop = fop; + + fop->locks_update |= update << fop->lock_count; + fop->lock_count++; lock->refs++; } -void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc) +void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) { ec_lock_t * lock = NULL; ec_inode_t * ctx = NULL; - char * name = NULL; loc_t tmp; int32_t error; @@ -680,12 +669,23 @@ void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc) return; } - error = ec_loc_parent(fop->xl, loc, &tmp, &name); - if (error != 0) + /* update is only 0 for 'opendir', which needs to lock the entry pointed + * by loc instead of its parent. + */ + if (update) { - ec_fop_set_error(fop, error); + error = ec_loc_parent(fop->xl, loc, &tmp); + if (error != 0) { + ec_fop_set_error(fop, error); - return; + return; + } + } else { + if (!ec_loc_from_loc(fop->xl, &tmp, loc)) { + ec_fop_set_error(fop, EIO); + + return; + } } LOCK(&tmp.inode->lock); @@ -698,16 +698,14 @@ void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc) goto unlock; } - list_for_each_entry(lock, &ctx->entry_locks, list) + if (ctx->entry_lock != NULL) { - if (strcmp(lock->basename, name) == 0) - { - ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s, " - "name=%s. Lock already acquired", - lock, tmp.inode, tmp.path, name); + lock = ctx->entry_lock; + ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s" + "Lock already acquired", + lock, tmp.inode, tmp.path); - goto insert; - } + goto insert; } lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp); @@ -721,22 +719,20 @@ void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc) ec_trace("LOCK_CREATE", fop, "lock=%p", lock); lock->type = ENTRYLK_WRLCK; - lock->basename = name; - name = NULL; - list_add_tail(&lock->list, &ctx->entry_locks); + lock->plock = &ctx->entry_lock; + ctx->entry_lock = lock; insert: - ec_lock_insert(fop, lock); + ec_lock_insert(fop, lock, update); unlock: UNLOCK(&tmp.inode->lock); loc_wipe(&tmp); - GF_FREE(name); } -void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc) +void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, int32_t update) { ec_lock_t * lock; ec_inode_t * ctx; @@ -756,9 +752,9 @@ void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc) goto unlock; } - if (!list_empty(&ctx->inode_locks)) + if (ctx->inode_lock != NULL) { - lock = list_entry(ctx->inode_locks.next, ec_lock_t, list); + lock = ctx->inode_lock; ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already " "acquired", lock, loc->inode); @@ -778,16 +774,17 @@ void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc) lock->flock.l_type = F_WRLCK; lock->flock.l_whence = SEEK_SET; - list_add_tail(&lock->list, &ctx->inode_locks); + lock->plock = &ctx->inode_lock; + ctx->inode_lock = lock; insert: - ec_lock_insert(fop, lock); + ec_lock_insert(fop, lock, update); unlock: UNLOCK(&loc->inode->lock); } -void ec_lock_prepare_fd(ec_fop_data_t * fop, fd_t * fd) +void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) { loc_t loc; @@ -798,7 +795,7 @@ void ec_lock_prepare_fd(ec_fop_data_t * fop, fd_t * fd) if (ec_loc_from_fd(fop->xl, &loc, fd)) { - ec_lock_prepare_inode(fop, &loc); + ec_lock_prepare_inode(fop, &loc, update); loc_wipe(&loc); } @@ -868,12 +865,11 @@ void ec_lock(ec_fop_data_t * fop) if (lock->kind == EC_LOCK_ENTRY) { - ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s, " - "name=%s", lock, lock->loc.inode, lock->loc.path, - lock->basename); + ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s", + lock, lock->loc.inode, lock->loc.path); ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, - lock, fop->xl->name, &lock->loc, lock->basename, + lock, fop->xl->name, &lock->loc, NULL, ENTRYLK_LOCK, lock->type, NULL); } else @@ -936,7 +932,7 @@ void ec_unlock(ec_fop_data_t * fop) refs = --lock->refs; if (refs == 0) { - list_del_init(&lock->list); + *lock->plock = NULL; } UNLOCK(&lock->loc.inode->lock); @@ -951,13 +947,12 @@ void ec_unlock(ec_fop_data_t * fop) { case EC_LOCK_ENTRY: ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, " - "path=%s, basename=%s", - lock, lock->loc.inode, lock->loc.path, - lock->basename); + "path=%s", + lock, lock->loc.inode, lock->loc.path); ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, ec_unlocked, lock, - fop->xl->name, &lock->loc, lock->basename, + fop->xl->name, &lock->loc, NULL, ENTRYLK_UNLOCK, lock->type, NULL); break; @@ -1061,17 +1056,23 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, } } - LOCK(&inode->lock); ctx = __ec_inode_get(inode, this); - if ((ctx != NULL) && !list_empty(&ctx->inode_locks)) - { - lock = list_entry(ctx->inode_locks.next, ec_lock_t, list); - - lock->have_size = 1; - lock->size = buf->ia_size; - lock->version = fop->answer->version; + if (ctx != NULL) { + if (ctx->inode_lock != NULL) { + lock = ctx->inode_lock; + lock->version = fop->answer->version; + + if (buf->ia_type == IA_IFREG) { + lock->have_size = 1; + lock->size = buf->ia_size; + } + } + if (ctx->entry_lock != NULL) { + lock = ctx->entry_lock; + lock->version = fop->answer->version; + } } UNLOCK(&inode->lock); @@ -1083,10 +1084,10 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, fop->parent->mask &= fop->good; } - fop->parent->pre_size = fop->parent->post_size = buf->ia_size; - - fop->parent->have_size = 1; - + if (buf->ia_type == IA_IFREG) { + fop->parent->pre_size = fop->parent->post_size = buf->ia_size; + fop->parent->have_size = 1; + } } else { @@ -1143,7 +1144,7 @@ void ec_get_size_version(ec_fop_data_t * fop) error = EIO; - if (fop->fd == NULL) + if (!fop->use_fd) { if (!ec_loc_from_loc(fop->xl, &loc, &fop->loc[0])) { @@ -1211,7 +1212,7 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, return 0; } -void ec_update_size_version(ec_fop_data_t * fop, uint64_t version, +void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version, uint64_t size) { dict_t * dict; @@ -1251,18 +1252,9 @@ void ec_update_size_version(ec_fop_data_t * fop, uint64_t version, fop->frame->root->uid = 0; fop->frame->root->gid = 0; - if (fop->fd == NULL) - { - ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, - ec_update_size_version_done, NULL, &fop->loc[0], - GF_XATTROP_ADD_ARRAY64, dict, NULL); - } - else - { - ec_fxattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, - ec_update_size_version_done, NULL, fop->fd, - GF_XATTROP_ADD_ARRAY64, dict, NULL); - } + ec_xattrop(fop->frame, fop->xl, fop->mask, EC_MINIMUM_MIN, + ec_update_size_version_done, NULL, loc, + GF_XATTROP_ADD_ARRAY64, dict, NULL); fop->frame->root->uid = uid; fop->frame->root->gid = gid; @@ -1291,8 +1283,6 @@ void ec_flush_size_version(ec_fop_data_t * fop) lock = fop->locks[0].lock; - GF_ASSERT(lock->kind == EC_LOCK_INODE); - LOCK(&lock->loc.inode->lock); GF_ASSERT(lock->owner == fop); @@ -1306,11 +1296,11 @@ void ec_flush_size_version(ec_fop_data_t * fop) if (version > 0) { - ec_update_size_version(fop, version, delta); + ec_update_size_version(fop, &lock->loc, version, delta); } } -void ec_lock_reuse(ec_fop_data_t * fop, int32_t update) +void ec_lock_reuse(ec_fop_data_t *fop) { ec_fop_data_t * wait_fop; ec_lock_t * lock; @@ -1321,6 +1311,10 @@ void ec_lock_reuse(ec_fop_data_t * fop, int32_t update) for (i = 0; i < fop->lock_count; i++) { + refs = 0; + delta = 0; + version = 0; + wait_fop = NULL; lock = fop->locks[i].lock; @@ -1332,28 +1326,26 @@ void ec_lock_reuse(ec_fop_data_t * fop, int32_t update) GF_ASSERT(lock->owner == fop); lock->owner = NULL; - if (lock->kind == EC_LOCK_INODE) - { - if (update && (fop->error == 0)) + if (((fop->locks_update >> i) & 1) != 0) { + if (fop->error == 0) { lock->version_delta++; lock->size_delta += fop->post_size - fop->pre_size; + if (fop->have_size) { + lock->size = fop->post_size; + lock->have_size = 1; + } } - version = lock->version_delta; - delta = lock->size_delta; - refs = lock->refs; - if (refs == 1) - { - lock->version_delta = 0; - lock->size_delta = 0; - } + } - if (fop->have_size) - { - lock->size = fop->post_size; - lock->have_size = 1; - } + version = lock->version_delta; + delta = lock->size_delta; + refs = lock->refs; + if (refs == 1) { + lock->version_delta = 0; + lock->size_delta = 0; } + lock->good_mask &= fop->mask; if (!list_empty(&lock->waiting)) @@ -1379,11 +1371,10 @@ void ec_lock_reuse(ec_fop_data_t * fop, int32_t update) ec_resume(wait_fop, 0); } - } - if ((refs == 1) && (version > 0)) - { - ec_update_size_version(fop, version, delta); + if ((refs == 1) && (version > 0)) { + ec_update_size_version(fop, &lock->loc, version, delta); + } } } |