diff options
| author | Xavier Hernandez <xhernandez@datalab.es> | 2014-11-07 12:12:19 +0100 |
|---|---|---|
| committer | Raghavendra Bhat <raghavendra@redhat.com> | 2014-12-21 08:04:21 -0800 |
| commit | d2bde251a93968a03b7b49b3127f29ebdce73b13 (patch) | |
| tree | a96fd2a58a2f5824bcabef01da74eda60d66731e /xlators/cluster/ec/src/ec-common.c | |
| parent | 5c7afe9c1d6086ba1fb1d6e8d352da12a828ad8d (diff) | |
ec: Fix self-healing issues.
Three problems have been detected:
1. Self healing is executed in background, allowing the fop that
detected the problem to continue without blocks nor delays.
While this is quite interesting to avoid unnecessary delays,
it can cause spurious failures of self-heal because it may
try to recover a file inside a directory that a previous
self-heal has not recovered yet, causing the file self-heal
to fail.
2. When a partial self-heal is being executed on a directory,
if a full self-heal is attempted, it won't be executed
because another self-heal is already in process, so the
directory won't be fully repaired.
3. Information contained in loc's of some fop's is not enough
to do a complete self-heal.
To solve these problems, I've made some changes:
* Improved ec_loc_from_loc() to add all available information
to a loc.
* Before healing an entry, it's parent is checked and partially
healed if necessary to avoid failures.
* All heal requests received for the same inode while another
self-heal is being processed are queued. When the first heal
completes, all pending requests are answered using the results
of the first heal (without full execution), unless the first
heal was a partial heal. In this case all partial heals are
answered, and the first full heal is processed normally.
* An special virtual xattr (not physically stored on bricks)
named 'trusted.ec.heal' has been created to allow synchronous
self-heal of files.
Now, the recommended way to heal an entire volume is this:
find <mount> -d -exec getfattr -h -n trusted.ec.heal {} \;
Some minor changes:
* ec_loc_prepare() has been renamed to ec_loc_update().
* All loc management functions return 0 on success and -1 on
error.
* Do not delay fop unlocks if heal is needed.
* Added basic ec xattrs initially on create, mkdir and mknod
fops.
* Some coding style changes
This is a backport of http://review.gluster.org/9072/
Change-Id: I2a5fd9c57349a153710880d6ac4b1fa0c1475985
BUG: 1159484
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/9073
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 94 |
1 files changed, 50 insertions, 44 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 7da9e25f2c8..88cc103e513 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -116,20 +116,28 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, uintptr_t bad, dict_t * xdata) { - if (op_ret < 0) - { - gf_log(this->name, GF_LOG_WARNING, "Heal failed (error %d)", op_errno); - } - else - { - gf_log(this->name, GF_LOG_INFO, "Heal succeeded on %d/%d subvolumes", - ec_bits_count(mask & ~ (good | bad)), - ec_bits_count(mask & ~good)); + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "Heal failed (error %d)", + op_errno); + } else { + if ((mask & ~good) != 0) { + gf_log(this->name, GF_LOG_INFO, "Heal succeeded on %d/%d " + "subvolumes", + ec_bits_count(mask & ~(good | bad)), + ec_bits_count(mask & ~good)); + } } return 0; } +int32_t ec_fop_needs_heal(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + + return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; +} + void ec_check_status(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; @@ -144,8 +152,7 @@ void ec_check_status(ec_fop_data_t * fop) } } - if ((ec->xl_up & ~(fop->remaining | fop->good)) == 0) - { + if (!ec_fop_needs_heal(fop)) { return; } @@ -157,19 +164,19 @@ void ec_check_status(ec_fop_data_t * fop) if (fop->use_fd) { if (fop->fd != NULL) { - ec_fheal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, - NULL, fop->fd, partial, NULL); + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + fop->fd, partial, NULL); } } else { - ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[0], partial, NULL); if (fop->loc[1].inode != NULL) { - ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, - NULL, &fop->loc[1], partial, NULL); + ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + &fop->loc[1], partial, NULL); } } } @@ -321,16 +328,12 @@ void ec_complete(ec_fop_data_t * fop) ec_trace("COMPLETE", fop, ""); - if (--fop->winds == 0) - { - if (fop->answer == NULL) - { - if (!list_empty(&fop->cbk_list)) - { + if (--fop->winds == 0) { + if (fop->answer == NULL) { + if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); if ((cbk->count >= fop->minimum) && - ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) - { + ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) { fop->answer = cbk; ec_update_bad(fop, cbk->mask); @@ -601,7 +604,7 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) lock->kind = kind; lock->good_mask = -1ULL; INIT_LIST_HEAD(&lock->waiting); - if (!ec_loc_from_loc(xl, &lock->loc, loc)) + if (ec_loc_from_loc(xl, &lock->loc, loc) != 0) { mem_put(lock); lock = NULL; @@ -666,7 +669,6 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) ec_inode_t * ctx = NULL; ec_lock_link_t *link = NULL; loc_t tmp; - int32_t error; if ((fop->parent != NULL) || (fop->error != 0)) { @@ -678,14 +680,13 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) */ if (update) { - error = ec_loc_parent(fop->xl, loc, &tmp); - if (error != 0) { - ec_fop_set_error(fop, error); + if (ec_loc_parent(fop->xl, loc, &tmp) != 0) { + ec_fop_set_error(fop, EIO); return; } } else { - if (!ec_loc_from_loc(fop->xl, &tmp, loc)) { + if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) { ec_fop_set_error(fop, EIO); return; @@ -806,7 +807,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) return; } - if (ec_loc_from_fd(fop->xl, &loc, fd)) + if (ec_loc_from_fd(fop->xl, &loc, fd) == 0) { ec_lock_prepare_inode(fop, &loc, update); @@ -1075,7 +1076,7 @@ void ec_get_size_version(ec_fop_data_t * fop) if (!fop->use_fd) { - if (!ec_loc_from_loc(fop->xl, &loc, &fop->loc[0])) + if (ec_loc_from_loc(fop->xl, &loc, &fop->loc[0]) != 0) { goto out; } @@ -1090,9 +1091,7 @@ void ec_get_size_version(ec_fop_data_t * fop) loc.path = NULL; loc.name = NULL; } - } - else if (!ec_loc_from_fd(fop->xl, &loc, fop->fd)) - { + } else if (ec_loc_from_fd(fop->xl, &loc, fop->fd) != 0) { goto out; } @@ -1318,9 +1317,6 @@ void ec_unlock_timer_add(ec_lock_link_t *link) UNLOCK(&lock->loc.inode->lock); } else if (lock->acquired) { - delay.tv_sec = 1; - delay.tv_nsec = 0; - LOCK(&fop->lock); fop->jobs++; @@ -1328,15 +1324,25 @@ void ec_unlock_timer_add(ec_lock_link_t *link) UNLOCK(&fop->lock); - lock->timer = gf_timer_call_after(fop->xl->ctx, delay, - ec_unlock_timer_cbk, link); - if (lock->timer == NULL) { - gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an unlock"); + /* If healing is needed, do not delay lock release to let self-heal + * start working as soon as possible. */ + if (!ec_fop_needs_heal(fop)) { + ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); + + delay.tv_sec = 1; + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, + ec_unlock_timer_cbk, link); + if (lock->timer == NULL) { + gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an " + "unlock"); + *lock->plock = NULL; + refs = 0; + } + } else { *lock->plock = NULL; refs = 0; - } else { - ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); } UNLOCK(&lock->loc.inode->lock); |
