diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2014-01-09 16:55:11 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-04-28 09:44:58 -0700 |
commit | 800258b54a4a776430410eb949cfded147c4ae8a (patch) | |
tree | 81bd10a6e873868822e4271c43ddfe90f19c6132 /xlators | |
parent | 26843b00447cc14427b4f02ca136033b56ca093f (diff) |
cluster/afr: Prevent heal info hang when data-self-heal in progress.
Problem:
To determine whether data-self-heal is needed, afr takes blocking
locks. So if self-heal is indeed in progress on the file, this leads
to hangs. 'heal info' hung for almost 50 minutes while a 50G file
was undergoing heal.
Fix:
When self-heal is in progress, there is a live self-heal-domain lock.
At this stage, if a non-blocking inodelk for the self-heal-domain lock
is attempted, it will fail with EAGAIN. For heal info we can use this
behaviour to determine that the file is possibly undergoing heal and
inform the user instead of waiting for the self-heal to complete.
Change-Id: I18527c59e429602bae49c98ff45502833ab8e1f0
BUG: 1039544
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/7482
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-lk-common.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 1 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 21 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 2 |
5 files changed, 25 insertions, 3 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 1e57ebb9d7c..6a453060c9e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1822,6 +1822,8 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, if (ret) gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " "sh-failed to %d", local->loc.path, sh_failed); + ret = dict_set_int32 (xattr, "possibly-healing", + local->self_heal.possibly_healing); } else { ret = dict_set_int32 (xattr, "metadata-self-heal-pending", local->self_heal.metadata_sh_pending); diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 060d78f3505..c492114c6f6 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1432,6 +1432,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { if (op_ret < 0) { + int_lock->lock_op_errno = op_errno; if (op_errno == ENOSYS) { /* return ENOTSUP */ gf_log (this->name, GF_LOG_ERROR, @@ -1440,7 +1441,6 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "server"); local->op_ret = op_ret; int_lock->lock_op_ret = op_ret; - int_lock->lock_op_errno = op_errno; local->op_errno = op_errno; } if (local->transaction.eager_lock) diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 4916bf45d68..8861870372d 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2360,6 +2360,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) orig_frame_sh->entry_sh_pending = sh->entry_sh_pending; orig_frame_sh->data_sh_pending = sh->data_sh_pending; orig_frame_sh->metadata_sh_pending = sh->metadata_sh_pending; + orig_frame_sh->possibly_healing = sh->possibly_healing; sh->unwind (sh->orig_frame, this, sh->op_ret, 
sh->op_errno, is_self_heal_failed (sh, AFR_CHECK_ALL)); } diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 02c91c95710..e740ac9a308 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1418,6 +1418,22 @@ afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_lock_failure (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_internal_lock_t *int_lock = NULL; + + local = frame->local; + sh = &local->self_heal; + int_lock = &local->internal_lock; + if (EAGAIN == int_lock->lock_op_errno) + sh->possibly_healing = _gf_true; + afr_sh_data_fail (frame, this); + return 0; +} + +int afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1612,8 +1628,9 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, - afr_sh_dom_lock_success, afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, !sh->dry_run, + priv->sh_domain, afr_sh_dom_lock_success, + afr_sh_dom_lock_failure); } return 0; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index ad8964ccbaa..49ca64c75c1 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -288,6 +288,8 @@ struct afr_self_heal_ { int32_t dry_run; gf_boolean_t metadata_sh_pending; + gf_boolean_t possibly_healing; //set when it is detected + //that a self-heal is in progress gf_boolean_t data_sh_pending; gf_boolean_t entry_sh_pending; |