diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2014-11-07 12:12:19 +0100 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-12-04 11:34:38 -0800 |
commit | bc91dd4de39ffd481a52b837f322f6782c14e9f1 (patch) | |
tree | a7a550c659bdee92b5d29839f87f5473d0b367bc /xlators | |
parent | 7319b01ffa3d4ff7b1405873c8caeaf8a1f7b5d6 (diff) |
ec: Fix self-healing issues.
Three problems have been detected:
1. Self-healing is executed in the background, allowing the fop that
detected the problem to continue without blocking or delays.
While this is quite interesting to avoid unnecessary delays,
it can cause spurious failures of self-heal because it may
try to recover a file inside a directory that a previous
self-heal has not recovered yet, causing the file self-heal
to fail.
2. When a partial self-heal is being executed on a directory,
if a full self-heal is attempted, it won't be executed
because another self-heal is already in progress, so the
directory won't be fully repaired.
3. Information contained in the locs of some fops is not enough
to do a complete self-heal.
To solve these problems, I've made some changes:
* Improved ec_loc_from_loc() to add all available information
to a loc.
* Before healing an entry, its parent is checked and partially
healed if necessary to avoid failures.
* All heal requests received for the same inode while another
self-heal is being processed are queued. When the first heal
completes, all pending requests are answered using the results
of the first heal (without full execution), unless the first
heal was a partial heal. In this case all partial heals are
answered, and the first full heal is processed normally.
* A special virtual xattr (not physically stored on bricks)
named 'trusted.ec.heal' has been created to allow synchronous
self-heal of files.
Now, the recommended way to heal an entire volume is this:
find <mount> -d -exec getfattr -h -n trusted.ec.heal {} \;
Some minor changes:
* ec_loc_prepare() has been renamed to ec_loc_update().
* All loc management functions return 0 on success and -1 on
error.
* Do not delay fop unlocks if heal is needed.
* Added basic ec xattrs initially on create, mkdir and mknod
fops.
* Some coding style changes
Change-Id: I2a5fd9c57349a153710880d6ac4b1fa0c1475985
BUG: 1161588
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/9072
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 96 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 50 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-read.c | 4 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 150 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 7 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 182 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 271 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 4 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 96 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.h | 1 |
10 files changed, 548 insertions, 313 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 2d69ac0f384..894d2f552f3 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -116,20 +116,28 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, uintptr_t bad, dict_t * xdata) { - if (op_ret < 0) - { - gf_log(this->name, GF_LOG_WARNING, "Heal failed (error %d)", op_errno); - } - else - { - gf_log(this->name, GF_LOG_INFO, "Heal succeeded on %d/%d subvolumes", - ec_bits_count(mask & ~ (good | bad)), - ec_bits_count(mask & ~good)); + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "Heal failed (error %d)", + op_errno); + } else { + if ((mask & ~good) != 0) { + gf_log(this->name, GF_LOG_INFO, "Heal succeeded on %d/%d " + "subvolumes", + ec_bits_count(mask & ~(good | bad)), + ec_bits_count(mask & ~good)); + } } return 0; } +int32_t ec_fop_needs_heal(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; + + return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; +} + void ec_check_status(ec_fop_data_t * fop) { ec_t * ec = fop->xl->private; @@ -144,8 +152,7 @@ void ec_check_status(ec_fop_data_t * fop) } } - if ((ec->xl_up & ~(fop->remaining | fop->good)) == 0) - { + if (!ec_fop_needs_heal(fop)) { return; } @@ -157,19 +164,19 @@ void ec_check_status(ec_fop_data_t * fop) if (fop->use_fd) { if (fop->fd != NULL) { - ec_fheal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, - NULL, fop->fd, partial, NULL); + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + fop->fd, partial, NULL); } } else { - ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, + ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[0], partial, NULL); if (fop->loc[1].inode != NULL) { - ec_heal(fop->frame, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, - NULL, &fop->loc[1], partial, NULL); + ec_heal(NULL, fop->xl, -1, 
EC_MINIMUM_ONE, ec_heal_report, NULL, + &fop->loc[1], partial, NULL); } } } @@ -320,16 +327,12 @@ void ec_complete(ec_fop_data_t * fop) ec_trace("COMPLETE", fop, ""); - if (--fop->winds == 0) - { - if (fop->answer == NULL) - { - if (!list_empty(&fop->cbk_list)) - { + if (--fop->winds == 0) { + if (fop->answer == NULL) { + if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); if ((cbk->count >= fop->minimum) && - ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) - { + ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) { fop->answer = cbk; ec_update_bad(fop, cbk->mask); @@ -600,7 +603,7 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc) lock->kind = kind; lock->good_mask = -1ULL; INIT_LIST_HEAD(&lock->waiting); - if (!ec_loc_from_loc(xl, &lock->loc, loc)) + if (ec_loc_from_loc(xl, &lock->loc, loc) != 0) { mem_put(lock); lock = NULL; @@ -665,7 +668,6 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) ec_inode_t * ctx = NULL; ec_lock_link_t *link = NULL; loc_t tmp; - int32_t error; if ((fop->parent != NULL) || (fop->error != 0)) { @@ -677,14 +679,13 @@ void ec_lock_prepare_entry(ec_fop_data_t *fop, loc_t *loc, int32_t update) */ if (update) { - error = ec_loc_parent(fop->xl, loc, &tmp); - if (error != 0) { - ec_fop_set_error(fop, error); + if (ec_loc_parent(fop->xl, loc, &tmp) != 0) { + ec_fop_set_error(fop, EIO); return; } } else { - if (!ec_loc_from_loc(fop->xl, &tmp, loc)) { + if (ec_loc_from_loc(fop->xl, &tmp, loc) != 0) { ec_fop_set_error(fop, EIO); return; @@ -805,7 +806,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update) return; } - if (ec_loc_from_fd(fop->xl, &loc, fd)) + if (ec_loc_from_fd(fop->xl, &loc, fd) == 0) { ec_lock_prepare_inode(fop, &loc, update); @@ -1074,7 +1075,7 @@ void ec_get_size_version(ec_fop_data_t * fop) if (!fop->use_fd) { - if (!ec_loc_from_loc(fop->xl, &loc, &fop->loc[0])) + if (ec_loc_from_loc(fop->xl, &loc, 
&fop->loc[0]) != 0) { goto out; } @@ -1089,9 +1090,7 @@ void ec_get_size_version(ec_fop_data_t * fop) loc.path = NULL; loc.name = NULL; } - } - else if (!ec_loc_from_fd(fop->xl, &loc, fop->fd)) - { + } else if (ec_loc_from_fd(fop->xl, &loc, fop->fd) != 0) { goto out; } @@ -1317,11 +1316,6 @@ void ec_unlock_timer_add(ec_lock_link_t *link) UNLOCK(&lock->loc.inode->lock); } else { - ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); - - delay.tv_sec = 1; - delay.tv_nsec = 0; - LOCK(&fop->lock); fop->jobs++; @@ -1329,11 +1323,23 @@ void ec_unlock_timer_add(ec_lock_link_t *link) UNLOCK(&fop->lock); - lock->timer = gf_timer_call_after(fop->xl->ctx, delay, - ec_unlock_timer_cbk, link); - if (lock->timer == NULL) { - gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an unlock"); - + /* If healing is needed, do not delay lock release to let self-heal + * start working as soon as possible. */ + if (!ec_fop_needs_heal(fop)) { + ec_trace("UNLOCK_DELAY", fop, "lock=%p", lock); + + delay.tv_sec = 1; + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, + ec_unlock_timer_cbk, link); + if (lock->timer == NULL) { + gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to delay an " + "unlock"); + + *lock->plock = NULL; + refs = 0; + } + } else { *lock->plock = NULL; refs = 0; } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index d22a20090df..35c84254550 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -66,10 +66,11 @@ struct _ec_fd struct _ec_inode { - uintptr_t bad; - ec_lock_t *entry_lock; - ec_lock_t *inode_lock; - ec_heal_t *heal; + uintptr_t bad; + ec_lock_t *entry_lock; + ec_lock_t *inode_lock; + struct list_head heal; + }; typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, @@ -199,6 +200,7 @@ struct _ec_fop_data ec_resume_f resume; ec_cbk_t cbks; void * data; + ec_heal_t *heal; uint64_t user_size; uint32_t head; @@ -255,25 +257,27 @@ struct _ec_cbk_data 
struct _ec_heal { - gf_lock_t lock; - xlator_t * xl; - ec_fop_data_t * fop; - ec_fop_data_t * lookup; - loc_t loc; - struct iatt iatt; - char * symlink; - fd_t * fd; - int32_t partial; - int32_t done; - uintptr_t available; - uintptr_t good; - uintptr_t bad; - uintptr_t open; - uintptr_t fixed; - uint64_t offset; - uint64_t size; - uint64_t version; - uint64_t raw_size; + struct list_head list; + gf_lock_t lock; + xlator_t *xl; + ec_fop_data_t *fop; + void *data; + ec_fop_data_t *lookup; + loc_t loc; + struct iatt iatt; + char *symlink; + fd_t *fd; + int32_t partial; + int32_t done; + uintptr_t available; + uintptr_t good; + uintptr_t bad; + uintptr_t open; + uintptr_t fixed; + uint64_t offset; + uint64_t size; + uint64_t version; + uint64_t raw_size; }; ec_cbk_data_t * ec_cbk_data_allocate(call_frame_t * frame, xlator_t * this, diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c index 0e91d5f416e..95d80efdf8b 100644 --- a/xlators/cluster/ec/src/ec-dir-read.c +++ b/xlators/cluster/ec/src/ec-dir-read.c @@ -114,8 +114,8 @@ int32_t ec_manager_opendir(ec_fop_data_t * fop, int32_t state) LOCK(&fop->fd->lock); ctx = __ec_fd_get(fop->fd, fop->xl); - if ((ctx == NULL) || !ec_loc_from_loc(fop->xl, &ctx->loc, - &fop->loc[0])) { + if ((ctx == NULL) || + (ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0])) != 0) { UNLOCK(&fop->fd->lock); fop->error = EIO; diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index 3d8055c40a7..2b1064a98f0 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -149,9 +149,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) LOCK(&fop->fd->lock); ctx = __ec_fd_get(fop->fd, fop->xl); - if ((ctx == NULL) || !ec_loc_from_loc(fop->xl, &ctx->loc, - &fop->loc[0])) - { + if ((ctx == NULL) || + (ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0])) != 0) { UNLOCK(&fop->fd->lock); fop->error = EIO; @@ -194,6 +193,18 @@ int32_t 
ec_manager_create(ec_fop_data_t * fop, int32_t state) return EC_STATE_REPORT; } + if (ec_dict_set_number(fop->xdata, EC_XATTR_VERSION, 0) != 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + + if (ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0) != 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + fop->int32 &= ~O_ACCMODE; fop->int32 |= O_RDWR; @@ -222,27 +233,27 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - ec_fop_set_error(fop, cbk->op_errno); - } - else - { + if (cbk->op_ret >= 0) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, - &cbk->iatt[0]); + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } else { + LOCK(&fop->fd->lock); - LOCK(&fop->fd->lock); + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + ctx->open |= cbk->mask; + } - ctx = __ec_fd_get(fop->fd, fop->xl); - if (ctx != NULL) - { - ctx->open |= cbk->mask; + UNLOCK(&fop->fd->lock); } - - UNLOCK(&fop->fd->lock); + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else @@ -511,22 +522,21 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - ec_fop_set_error(fop, cbk->op_errno); - } - else - { + if (cbk->op_ret >= 0) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); - - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, - &cbk->iatt[0]); - - if (cbk->iatt[0].ia_type == IA_IFREG) - { + if (cbk->iatt[0].ia_type == IA_IFREG) { cbk->iatt[0].ia_size = fop->pre_size; } + + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else @@ -754,6 +764,23 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state) switch (state) { case 
EC_STATE_INIT: + if (fop->xdata == NULL) { + fop->xdata = dict_new(); + if (fop->xdata == NULL) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + } + + if (ec_dict_set_number(fop->xdata, EC_XATTR_VERSION, 0) != 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + + /* Fall through */ + case EC_STATE_LOCK: ec_lock_prepare_entry(fop, &fop->loc[0], 1); ec_lock(fop); @@ -777,17 +804,18 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - ec_fop_set_error(fop, cbk->op_errno); - } - else - { + if (cbk->op_ret >= 0) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, - &cbk->iatt[0]); + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else @@ -1037,6 +1065,18 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) return EC_STATE_REPORT; } + + if (ec_dict_set_number(fop->xdata, EC_XATTR_VERSION, 0) != 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } + + if (ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0) != 0) { + fop->error = EIO; + + return EC_STATE_REPORT; + } } /* Fall through */ @@ -1064,17 +1104,18 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - ec_fop_set_error(fop, cbk->op_errno); - } - else - { + if (cbk->op_ret >= 0) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, - &cbk->iatt[0]); + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else @@ -1822,17 +1863,18 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - 
ec_fop_set_error(fop, cbk->op_errno); - } - else - { + if (cbk->op_ret >= 0) { ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count); - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, - &cbk->iatt[0]); + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, + &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 63edf7da132..ffc40f01de1 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -711,7 +711,12 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk) ec_dict_del_number(cbk->xdata, EC_XATTR_VERSION, &cbk->version); - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]); + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + + return; + } LOCK(&cbk->inode->lock); diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 042f24e7d4a..da5f5947de3 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -62,8 +62,8 @@ void ec_heal_lookup_resume(ec_fop_data_t * fop) heal->fop->post_size = cbk->iatt[0].ia_size; heal->fop->have_size = 1; - if (!ec_loc_prepare(heal->xl, &heal->loc, cbk->inode, - &cbk->iatt[0])) + if (ec_loc_update(heal->xl, &heal->loc, cbk->inode, + &cbk->iatt[0]) != 0) { fop->answer = NULL; fop->error = EIO; @@ -383,6 +383,41 @@ int32_t ec_heal_create(ec_heal_t * heal, uintptr_t mask, int32_t try_link) return 0; } +int32_t ec_heal_parent_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, + int32_t op_ret, int32_t op_errno, uintptr_t mask, + uintptr_t good, uintptr_t bad, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + ec_heal_t *heal = fop->data; + + /* Even if parent self-heal has failed, we try to heal the current entry */ + ec_heal_create(heal, 
fop->mask, 0); + + return 0; +} + +void ec_heal_parent(ec_heal_t *heal, uintptr_t mask) +{ + loc_t parent; + int32_t healing = 0; + + /* First we try to do a partial heal of the parent directory to avoid + * ENOENT/ENOTDIR errors caused by missing parents */ + if (ec_loc_parent(heal->xl, &heal->loc, &parent) == 0) { + if (!__is_root_gfid(parent.gfid)) { + ec_heal(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE, + ec_heal_parent_cbk, heal, &parent, 1, NULL); + + healing = 1; + } + loc_wipe(&parent); + } + + if (!healing) { + ec_heal_create(heal, mask, 0); + } +} + void ec_heal_recreate(ec_fop_data_t * fop) { ec_cbk_data_t * cbk; @@ -405,7 +440,7 @@ void ec_heal_recreate(ec_fop_data_t * fop) if (mask != 0) { - ec_heal_create(heal, mask, 0); + ec_heal_parent(heal, mask); } } @@ -458,8 +493,7 @@ int32_t ec_heal_init(ec_fop_data_t * fop) memset(heal, 0, sizeof(ec_heal_t)); - if (!ec_loc_from_loc(fop->xl, &heal->loc, &fop->loc[0])) - { + if (ec_loc_from_loc(fop->xl, &heal->loc, &fop->loc[0]) != 0) { error = ENOMEM; goto out; @@ -472,6 +506,7 @@ int32_t ec_heal_init(ec_fop_data_t * fop) pool = fop->xl->ctx->iobuf_pool; heal->size = iobpool_default_pagesize(pool) * ec->fragments; heal->partial = fop->int32; + fop->heal = heal; LOCK(&inode->lock); @@ -483,20 +518,30 @@ int32_t ec_heal_init(ec_fop_data_t * fop) goto unlock; } - if (ctx->heal != NULL) - { + if (list_empty(&ctx->heal)) { + gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path, + uuid_utoa(heal->loc.gfid)); + } else { error = EEXIST; - - goto unlock; } - fop->data = heal; - - ctx->heal = heal; + list_add_tail(&heal->list, &ctx->heal); heal = NULL; unlock: UNLOCK(&inode->lock); + + if (error == EEXIST) { + LOCK(&fop->lock); + + fop->jobs++; + fop->refs++; + + UNLOCK(&fop->lock); + + error = 0; + } + out: GF_FREE(heal); @@ -506,12 +551,9 @@ out: void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd) { loc_t loc; - int32_t error; - error = ec_loc_parent(heal->xl, &heal->loc, &loc); - if (error != 0) 
- { - ec_fop_set_error(heal->fop, error); + if (ec_loc_parent(heal->xl, &heal->loc, &loc) != 0) { + ec_fop_set_error(heal->fop, EIO); return; } @@ -605,7 +647,8 @@ void ec_heal_remove_others(ec_heal_t * heal) if (cbk->op_ret < 0) { - if ((cbk->op_errno != ENOENT) && (cbk->op_errno != ENOTDIR)) + if ((cbk->op_errno != ENOENT) && (cbk->op_errno != ENOTDIR) && + (cbk->op_errno != ESTALE)) { gf_log(heal->xl->name, GF_LOG_WARNING, "Don't know how to " "remove inode with " @@ -635,7 +678,7 @@ void ec_heal_prepare_others(ec_heal_t * heal) if (cbk->op_ret < 0) { - if (cbk->op_errno == ENOENT) + if ((cbk->op_errno == ENOENT) || (cbk->op_errno == ESTALE)) { ec_heal_create(heal, cbk->mask, 1); } @@ -1061,35 +1104,61 @@ void ec_heal_data(ec_heal_t * heal) } } -void ec_heal_dispatch(ec_heal_t * heal) +void ec_heal_dispatch(ec_heal_t *heal) { - ec_fop_data_t * fop = heal->fop; - ec_cbk_data_t * cbk; - inode_t * inode; - ec_inode_t * ctx; + ec_fop_data_t *fop; + ec_cbk_data_t *cbk; + inode_t *inode; + ec_inode_t *ctx; + ec_heal_t *next = NULL; + struct list_head list; int32_t error; inode = heal->loc.inode; + INIT_LIST_HEAD(&list); + LOCK(&inode->lock); - ctx = __ec_inode_get(inode, heal->xl); - if (ctx != NULL) - { - ctx->bad &= ~heal->good; - ctx->heal = NULL; - } + /* A heal object not belonging to any list means that it has not been fully + * executed. It got its information from a previous heal that was executing + * when this heal started. */ + if (!list_empty(&heal->list)) { + list_del_init(&heal->list); + ctx = __ec_inode_get(inode, heal->xl); + if (ctx != NULL) { + ctx->bad &= ~heal->good; - fop->data = NULL; + if (heal->partial) { + /* Collect all partial heal requests. All of them will receive + * the same answer. 
'next' will contain a pointer to the first + * full request (if any) after this partial heal request.*/ + while (!list_empty(&ctx->heal)) { + next = list_entry(ctx->heal.next, ec_heal_t, list); + if (!next->partial) { + break; + } + list_move_tail(&next->list, &list); + } + if (list_empty(&ctx->heal)) { + next = NULL; + } + } else { + /* This is a full heal request, so take all received heal + * requests to answer them now. */ + list_splice_init(&ctx->heal, &list); + } + } + } UNLOCK(&inode->lock); + fop = heal->fop; error = fop->error; cbk = ec_cbk_data_allocate(fop->frame, heal->xl, fop, fop->id, 0, error == 0 ? 0 : -1, error); - if (cbk != NULL) - { + if (cbk != NULL) { cbk->uintptr[0] = heal->available; cbk->uintptr[1] = heal->good; cbk->uintptr[2] = heal->fixed; @@ -1097,9 +1166,7 @@ void ec_heal_dispatch(ec_heal_t * heal) ec_combine(cbk, NULL); fop->answer = cbk; - } - else if (error == 0) - { + } else if (error == 0) { error = ENOMEM; } @@ -1119,16 +1186,38 @@ void ec_heal_dispatch(ec_heal_t * heal) GF_FREE(heal); ec_fop_set_error(fop, error); + + /* Resume all pending heal requests, setting the same data obtained by + * this heal execution. */ + while (!list_empty(&list)) { + heal = list_entry(list.next, ec_heal_t, list); + list_del_init(&heal->list); + + heal->available = cbk->uintptr[0]; + heal->good = cbk->uintptr[1]; + heal->fixed = cbk->uintptr[2]; + + /* Setting 'done' to 1 avoids executing all heal logic and directly + * reports the result to the caller. */ + heal->done = 1; + + ec_resume(heal->fop, error); + } + + /* If there is a pending full request, resume it. 
*/ + if (next != NULL) { + ec_resume(next->fop, 0); + } } void ec_wind_heal(ec_t * ec, ec_fop_data_t * fop, int32_t idx) { ec_cbk_data_t * cbk; - ec_heal_t * heal = fop->data; + ec_heal_t *heal = fop->heal; ec_trace("WIND", fop, "idx=%d", idx); - cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_HEAL, idx, + cbk = ec_cbk_data_allocate(fop->frame, fop->xl, fop, EC_FOP_HEAL, idx, fop->error == 0 ? 0 : -1, fop->error); if (cbk != NULL) { @@ -1145,7 +1234,7 @@ void ec_wind_heal(ec_t * ec, ec_fop_data_t * fop, int32_t idx) int32_t ec_manager_heal(ec_fop_data_t * fop, int32_t state) { ec_cbk_data_t * cbk; - ec_heal_t * heal = fop->data; + ec_heal_t *heal = fop->heal; switch (state) { @@ -1158,10 +1247,14 @@ int32_t ec_manager_heal(ec_fop_data_t * fop, int32_t state) return EC_STATE_REPORT; } - /* Fall through */ + return EC_STATE_DISPATCH; case EC_STATE_DISPATCH: - ec_heal_entrylk(fop->data, ENTRYLK_LOCK); + if (heal->done) { + return EC_STATE_HEAL_DISPATCH; + } + + ec_heal_entrylk(heal, ENTRYLK_LOCK); return EC_STATE_HEAL_ENTRY_LOOKUP; @@ -1405,10 +1498,9 @@ void ec_heal(call_frame_t * frame, xlator_t * this, uintptr_t target, gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame); VALIDATE_OR_GOTO(this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); - fop = ec_fop_data_allocate(NULL, this, EC_FOP_HEAL, + fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, EC_FLAG_UPDATE_LOC_INODE, target, minimum, ec_wind_heal, ec_manager_heal, callback, data); if (fop == NULL) @@ -1457,11 +1549,11 @@ out: void ec_wind_fheal(ec_t * ec, ec_fop_data_t * fop, int32_t idx) { ec_cbk_data_t * cbk; - ec_heal_t * heal = fop->data; + ec_heal_t *heal = fop->heal; ec_trace("WIND", fop, "idx=%d", idx); - cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_FHEAL, idx, + cbk = ec_cbk_data_allocate(fop->frame, fop->xl, fop, EC_FOP_FHEAL, idx, fop->error == 0 ? 
0 : -1, fop->error); if (cbk != NULL) { diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 6dae0232a01..3c3e2302e53 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -324,218 +324,242 @@ int32_t ec_loc_gfid_check(xlator_t * xl, uuid_t dst, uuid_t src) return 1; } -int32_t ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent) +int32_t ec_loc_setup_inode(xlator_t *xl, loc_t *loc) { - char * str = NULL; - int32_t error = 0; - - memset(parent, 0, sizeof(loc_t)); + int32_t ret = -1; - if (loc->inode == NULL) - { - gf_log(xl->name, GF_LOG_ERROR, "Invalid loc"); - - error = EINVAL; - - goto out; + if (loc->inode != NULL) { + if (!ec_loc_gfid_check(xl, loc->gfid, loc->inode->gfid)) { + goto out; + } + } else if (loc->parent != NULL) { + if (!uuid_is_null(loc->gfid)) { + loc->inode = inode_find(loc->parent->table, loc->gfid); + } else if (loc->path != NULL) { + loc->inode = inode_resolve(loc->parent->table, (char *)loc->path); + } } - if (__is_root_gfid(loc->inode->gfid) || __is_root_gfid(loc->gfid) || - ((loc->path != NULL) && (strcmp(loc->path, "/") == 0))) - { - parent->path = gf_strdup("/"); - if (parent->path == NULL) { - gf_log(xl->name, GF_LOG_ERROR, "Unable to duplicate path '/'"); + ret = 0; - error = ENOMEM; +out: + return ret; +} +int32_t ec_loc_setup_parent(xlator_t *xl, loc_t *loc) +{ + char *path, *parent; + int32_t ret = -1; + + if (loc->parent != NULL) { + if (!ec_loc_gfid_check(xl, loc->pargfid, loc->parent->gfid)) { goto out; } - - parent->gfid[15] = 1; - parent->inode = inode_find(loc->inode->table, parent->gfid); - - return 0; + } else if (loc->inode != NULL) { + if (!uuid_is_null(loc->pargfid)) { + loc->parent = inode_find(loc->inode->table, loc->pargfid); + } else if (loc->path != NULL) { + path = gf_strdup(loc->path); + if (path == NULL) { + gf_log(xl->name, GF_LOG_ERROR, "Unable to duplicate path '%s'", + loc->path); + + goto out; + } + parent = dirname(path); + 
loc->parent = inode_resolve(loc->inode->table, parent); + GF_FREE(path); + } } - if (loc->path != NULL) { - str = gf_strdup(loc->path); - if (str == NULL) - { - gf_log(xl->name, GF_LOG_ERROR, "Unable to duplicate path " - "'%s'", loc->path); + ret = 0; - error = ENOMEM; +out: + return ret; +} - goto out; - } - parent->path = gf_strdup(dirname(str)); - if (parent->path == NULL) - { - gf_log(xl->name, GF_LOG_ERROR, "Unable to get dirname of " - "'%s'", loc->path); +int32_t ec_loc_setup_path(xlator_t *xl, loc_t *loc) +{ + uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *name; + int32_t ret = -1; - error = ENOMEM; + if (loc->path != NULL) { + name = strrchr(loc->path, '/'); + if (name == NULL) { + gf_log(xl->name, GF_LOG_ERROR, "Invalid path '%s' in loc", + loc->path); goto out; } - parent->name = strrchr(parent->path, '/'); - if (parent->name == NULL) - { - gf_log(xl->name, GF_LOG_ERROR, "Invalid path name (%s)", - parent->path); + if (name == loc->path) { + if (name[1] == 0) { + if (!ec_loc_gfid_check(xl, loc->gfid, root)) { + goto out; + } + } else { + if (!ec_loc_gfid_check(xl, loc->pargfid, root)) { + goto out; + } + } + } + name++; - error = EINVAL; + if (loc->name != NULL) { + if (strcmp(loc->name, name) != 0) { + gf_log(xl->name, GF_LOG_ERROR, "Invalid name '%s' in loc", + loc->name); - goto out; + goto out; + } + } else { + loc->name = name; } - parent->name++; } + + ret = 0; + +out: + return ret; +} + +int32_t ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent) +{ + char *str = NULL; + int32_t ret = -1; + + memset(parent, 0, sizeof(loc_t)); + if (loc->parent != NULL) { parent->inode = inode_ref(loc->parent); - uuid_copy(parent->gfid, loc->parent->gfid); } - if (!uuid_is_null(loc->pargfid) && uuid_is_null(parent->gfid)) { + if (!uuid_is_null(loc->pargfid)) { uuid_copy(parent->gfid, loc->pargfid); } - - if ((parent->inode == NULL) && (parent->path != NULL)) - { - if (strcmp(parent->path, "/") == 0) { - parent->inode = 
inode_ref(loc->inode->table->root); + if (loc->path != NULL) { + str = gf_strdup(loc->path); + if (str == NULL) { + gf_log(xl->name, GF_LOG_ERROR, "Unable to duplicate path '%s'", + loc->path); goto out; } - parent->inode = inode_resolve(loc->inode->table, (char *)parent->path); - if (parent->inode != NULL) { - goto out; - } - - gf_log(xl->name, GF_LOG_WARNING, "Unable to resolve parent inode"); - } - - if ((parent->inode == NULL) && !uuid_is_null(parent->gfid)) { - if (__is_root_gfid(parent->gfid)) { - parent->inode = inode_ref(loc->inode->table->root); + parent->path = gf_strdup(dirname(str)); + if (parent->path == NULL) { + gf_log(xl->name, GF_LOG_ERROR, "Unable to duplicate path '%s'", + dirname(str)); goto out; } - parent->inode = inode_find(loc->inode->table, parent->gfid); - if (parent->inode != NULL) { - goto out; - } + } - gf_log(xl->name, GF_LOG_WARNING, "Unable to find parent inode"); + if ((ec_loc_setup_path(xl, parent) != 0) || + (ec_loc_setup_inode(xl, parent) != 0) || + (ec_loc_setup_parent(xl, parent) != 0)) { + goto out; } if ((parent->inode == NULL) && (parent->path == NULL) && uuid_is_null(parent->gfid)) { gf_log(xl->name, GF_LOG_ERROR, "Parent inode missing for loc_t"); - error = EINVAL; - goto out; } + ret = 0; + out: GF_FREE(str); - if (error != 0) + if (ret != 0) { loc_wipe(parent); } - return error; + return ret; } -int32_t ec_loc_prepare(xlator_t * xl, loc_t * loc, inode_t * inode, - struct iatt * iatt) +int32_t ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, + struct iatt *iatt) { - if ((inode != NULL) && (loc->inode != inode)) - { - if (loc->inode != NULL) - { + int32_t ret = -1; + + if ((inode != NULL) && (loc->inode != inode)) { + if (loc->inode != NULL) { inode_unref(loc->inode); } loc->inode = inode_ref(inode); - uuid_copy(loc->gfid, inode->gfid); } - else if (loc->inode != NULL) - { - if (!ec_loc_gfid_check(xl, loc->gfid, loc->inode->gfid)) - { - return 0; - } - } - if (iatt != NULL) - { - if (!ec_loc_gfid_check(xl, 
loc->gfid, iatt->ia_gfid)) - { - return 0; + if (iatt != NULL) { + if (!ec_loc_gfid_check(xl, loc->gfid, iatt->ia_gfid)) { + goto out; } } - if (loc->parent != NULL) - { - if (!ec_loc_gfid_check(xl, loc->pargfid, loc->parent->gfid)) - { - return 0; - } - + if ((ec_loc_setup_path(xl, loc) != 0) || + (ec_loc_setup_inode(xl, loc) != 0) || + (ec_loc_setup_parent(xl, loc) != 0)) { + goto out; } - if (uuid_is_null(loc->gfid)) - { - gf_log(xl->name, GF_LOG_WARNING, "GFID not available for inode"); - } + ret = 0; - return 1; +out: + return ret; } int32_t ec_loc_from_fd(xlator_t * xl, loc_t * loc, fd_t * fd) { ec_fd_t * ctx; + int32_t ret = -1; memset(loc, 0, sizeof(*loc)); ctx = ec_fd_get(fd, xl); - if (ctx != NULL) - { - if (loc_copy(loc, &ctx->loc) != 0) - { - return 0; + if (ctx != NULL) { + if (loc_copy(loc, &ctx->loc) != 0) { + goto out; } } - if (ec_loc_prepare(xl, loc, fd->inode, NULL)) - { - return 1; + if (ec_loc_update(xl, loc, fd->inode, NULL) != 0) { + goto out; } - loc_wipe(loc); + ret = 0; - return 0; +out: + if (ret != 0) { + loc_wipe(loc); + } + + return ret; } int32_t ec_loc_from_loc(xlator_t * xl, loc_t * dst, loc_t * src) { + int32_t ret = -1; + memset(dst, 0, sizeof(*dst)); - if (loc_copy(dst, src) != 0) - { - return 0; + if (loc_copy(dst, src) != 0) { + goto out; } - if (ec_loc_prepare(xl, dst, NULL, NULL)) - { - return 1; + if (ec_loc_update(xl, dst, NULL, NULL) != 0) { + goto out; } - loc_wipe(dst); + ret = 0; - return 0; +out: + if (ret != 0) { + loc_wipe(dst); + } + + return ret; } void ec_owner_set(call_frame_t * frame, void * owner) @@ -560,6 +584,7 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl) if (ctx != NULL) { memset(ctx, 0, sizeof(*ctx)); + INIT_LIST_HEAD(&ctx->heal); value = (uint64_t)(uintptr_t)ctx; if (__inode_ctx_set(inode, xl, &value) != 0) diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index d654a49f34c..46791041a3f 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ 
b/xlators/cluster/ec/src/ec-helpers.h @@ -30,8 +30,8 @@ int32_t ec_dict_set_config(dict_t * dict, char * key, ec_config_t * config); int32_t ec_dict_del_config(dict_t * dict, char * key, ec_config_t * config); int32_t ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent); -int32_t ec_loc_prepare(xlator_t * xl, loc_t * loc, inode_t * inode, - struct iatt * iatt); +int32_t ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, + struct iatt *iatt); int32_t ec_loc_from_fd(xlator_t * xl, loc_t * loc, fd_t * fd); int32_t ec_loc_from_loc(xlator_t * xl, loc_t * dst, loc_t * src); diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index 88e9661743d..f9d1bcb052c 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -344,6 +344,59 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state) } } +int32_t ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, + int32_t op_ret, int32_t op_errno, uintptr_t mask, + uintptr_t good, uintptr_t bad, dict_t *xdata) +{ + ec_fop_data_t *fop = cookie; + fop_getxattr_cbk_t func = fop->data; + ec_t *ec = xl->private; + dict_t *dict = NULL; + char *str; + char bin1[65], bin2[65]; + + if (op_ret >= 0) { + dict = dict_new(); + if (dict == NULL) { + op_ret = -1; + op_errno = ENOMEM; + } else { + if (gf_asprintf(&str, "Good: %s, Bad: %s", + ec_bin(bin1, sizeof(bin1), good, ec->nodes), + ec_bin(bin2, sizeof(bin2), mask & ~(good | bad), + ec->nodes)) < 0) { + dict_unref(dict); + dict = NULL; + + op_ret = -1; + op_errno = ENOMEM; + + goto out; + } + + if (dict_set_str(dict, EC_XATTR_HEAL, str) != 0) { + GF_FREE(str); + dict_unref(dict); + dict = NULL; + + op_ret = -1; + op_errno = ENOMEM; + + goto out; + } + } + } + +out: + func(frame, NULL, xl, op_ret, op_errno, dict, NULL); + + if (dict != NULL) { + dict_unref(dict); + } + + return 0; +} + void ec_getxattr(call_frame_t * frame, xlator_t * this, uintptr_t target, int32_t minimum, 
fop_getxattr_cbk_t func, void * data, loc_t * loc, const char * name, dict_t * xdata) @@ -358,6 +411,14 @@ void ec_getxattr(call_frame_t * frame, xlator_t * this, uintptr_t target, GF_VALIDATE_OR_GOTO(this->name, frame, out); GF_VALIDATE_OR_GOTO(this->name, this->private, out); + /* Special handling of an explicit self-heal request */ + if ((name != NULL) && (strcmp(name, EC_XATTR_HEAL) == 0)) { + ec_heal(frame, this, target, EC_MINIMUM_ONE, ec_getxattr_heal_cbk, + func, loc, 0, NULL); + + return; + } + fop = ec_fop_data_allocate(frame, this, GF_FOP_GETXATTR, EC_FLAG_UPDATE_LOC_INODE, target, minimum, ec_wind_getxattr, ec_manager_getxattr, callback, @@ -650,9 +711,8 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state) LOCK(&fop->fd->lock); ctx = __ec_fd_get(fop->fd, fop->xl); - if ((ctx == NULL) || !ec_loc_from_loc(fop->xl, &ctx->loc, - &fop->loc[0])) - { + if ((ctx == NULL) || + (ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0])) != 0) { UNLOCK(&fop->fd->lock); fop->error = EIO; @@ -692,24 +752,24 @@ int32_t ec_manager_open(ec_fop_data_t * fop, int32_t state) cbk->op_errno = EIO; } } - if (cbk->op_ret < 0) - { - ec_fop_set_error(fop, cbk->op_errno); - } - else - { - ec_loc_prepare(fop->xl, &fop->loc[0], cbk->fd->inode, - NULL); + if (cbk->op_ret >= 0) { + if (ec_loc_update(fop->xl, &fop->loc[0], cbk->fd->inode, + NULL) != 0) { + cbk->op_ret = -1; + cbk->op_errno = EIO; + } else { + LOCK(&fop->fd->lock); - LOCK(&fop->fd->lock); + ctx = __ec_fd_get(fop->fd, fop->xl); + if (ctx != NULL) { + ctx->open |= cbk->mask; + } - ctx = __ec_fd_get(fop->fd, fop->xl); - if (ctx != NULL) - { - ctx->open |= cbk->mask; + UNLOCK(&fop->fd->lock); } - - UNLOCK(&fop->fd->lock); + } + if (cbk->op_ret < 0) { + ec_fop_set_error(fop, cbk->op_errno); } } else diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 37ce3fac089..c7db6226e25 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -17,6 +17,7 @@ #define EC_XATTR_CONFIG 
"trusted.ec.config" #define EC_XATTR_SIZE "trusted.ec.size" #define EC_XATTR_VERSION "trusted.ec.version" +#define EC_XATTR_HEAL "trusted.ec.heal" struct _ec; typedef struct _ec ec_t; |