diff options
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 253 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-types.h | 2 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.c | 77 |
4 files changed, 239 insertions, 94 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index cb627a92c9c..fbc7ac97aa0 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) return _gf_true; } +static ec_lock_link_t * +ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock) +{ + ec_lock_link_t *timer_link; + + /* If we don't have any timer, there's nothing to cancel. */ + if (lock->timer == NULL) { + return NULL; + } + + /* We are trying to access a lock that has an unlock timer active. + * This means that the lock must be idle, i.e. no fop can be in the + * owner, waiting or frozen lists. It also means that the lock cannot + * have been marked as being released (this is done without timers). + * There should only be one owner reference, but it's possible that + * some fops are being prepared to use this lock. */ + GF_ASSERT ((lock->refs_owners == 1) && + list_empty(&lock->owners) && list_empty(&lock->waiting)); + + /* We take the timer_link before cancelling the timer, since a + * successful cancellation will destroy it. It must not be NULL + * because it references the fop responsible for the delayed unlock + * that we are currently trying to cancel. */ + timer_link = lock->timer->data; + GF_ASSERT(timer_link != NULL); + + if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) { + /* It's too late to avoid the execution of the timer callback. + * Since we need to be sure that the callback has access to all + * needed resources, we cannot resume the execution of the + * timer fop now. This will be done in the callback. */ + timer_link = NULL; + } else { + /* The timer has been cancelled. The fop referenced by + * timer_link holds the last reference. The caller is + * responsible to release it when not needed anymore. */ + ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + } + + /* We have two options here: + * + * 1. The timer has been successfully cancelled. + * + * This is the easiest case and we can continue with the currently + * acquired lock. + * + * 2. The timer callback has already been fired. + * + * In this case we have not been able to cancel the timer before + * the timer callback has been fired, but we also know that + * lock->timer != NULL. This means that the timer callback is still + * trying to acquire the inode mutex that we currently own. We are + * safe until we release it. In this case we can safely clear + * lock->timer. This will cause that the timer callback does nothing + * once it acquires the mutex. + */ + lock->timer = NULL; + + return timer_link; +} + static gf_boolean_t ec_lock_assign_owner(ec_lock_link_t *link) { @@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link) * empty. */ GF_ASSERT(list_empty(&lock->frozen)); - if (lock->timer != NULL) { - /* We are trying to acquire a lock that has an unlock timer active. - * This means that the lock must be idle, i.e. no fop can be in the - * owner, waiting or frozen lists. It also means that the lock cannot - * have been marked as being released (this is done without timers). - * There should only be one owner reference, but it's possible that - * some fops are being prepared to use this lock. - */ - GF_ASSERT ((lock->refs_owners == 1) && - list_empty(&lock->owners) && list_empty(&lock->waiting)); - - /* We take the timer_link before cancelling the timer, since a - * successful cancellation will destroy it. It must not be NULL - * because it references the fop responsible for the delayed unlock - * that we are currently trying to cancel. */ - timer_link = lock->timer->data; - GF_ASSERT(timer_link != NULL); - - if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) { - /* It's too late to avoid the execution of the timer callback. - * Since we need to be sure that the callback has access to all - * needed resources, we cannot resume the execution of the timer - * fop now. This will be done in the callback. - */ - timer_link = NULL; - } else { - /* The timer has been cancelled, so we need to release the owner - * reference that was held by the fop waiting for the timer. This - * can be the last reference, but we'll immediately increment it - * for the current fop, so no need to check it. - */ - lock->refs_owners--; - - ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); - } - - /* We have two options here: - * - * 1. The timer has been successfully cancelled. - * - * This is the easiest case and we can continue with the currently - * acquired lock. - * - * 2. The timer callback has already been fired. - * - * In this case we have not been able to cancel the timer before - * the timer callback has been fired, but we also know that - * lock->timer != NULL. This means that the timer callback is still - * trying to acquire the inode mutex that we currently own. We are - * safe until we release it. In this case we can safely clear - * lock->timer. This will cause that the timer callback does nothing - * once it acquires the mutex. - */ - lock->timer = NULL; - } + timer_link = ec_lock_timer_cancel(fop->xl, lock); if (!list_empty(&lock->owners)) { /* There are other owners of this lock. We can only take ownership if @@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link) } list_add_tail(&link->owner_list, &lock->owners); - lock->refs_owners++; + + /* If timer_link is not NULL, it means that we have inherited the owner + * reference assigned to the timer fop. In this case we simply reuse it. + * Otherwise we need to increase the number of owners. */ + if (timer_link == NULL) { + lock->refs_owners++; + } assigned = _gf_true; @@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link) ec_resume(link->fop, 0); } +void +ec_lock_release(ec_t *ec, inode_t *inode) +{ + ec_lock_t *lock; + ec_inode_t *ctx; + ec_lock_link_t *timer_link = NULL; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, ec->xl); + if (ctx == NULL) { + goto done; + } + lock = ctx->inode_lock; + if ((lock == NULL) || !lock->acquired || lock->release) { + goto done; + } + + gf_msg_debug(ec->xl->name, 0, + "Releasing inode %p due to lock contention", inode); + + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(ec->xl, lock); + + /* We mark the lock to be released as soon as possible. */ + lock->release = _gf_true; + +done: + UNLOCK(&inode->lock); + + /* If we have cancelled the timer, we need to start the unlock of the + * inode. If there was a timer but we have been unable to cancel it + * because it was just triggered, the timer callback will take care + * of releasing the inode. */ + if (timer_link != NULL) { + ec_unlock_now(timer_link); + } +} + void ec_unlock_timer_add(ec_lock_link_t *link); void @@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data) ec_unlock_timer_del(data); } +static gf_boolean_t +ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop) +{ + /* Fops with no locks at this point mean that they are sent as sub-fops + * of other higher level fops. In this case we simply assume that the + * parent fop will take correct care of the eager lock. */ + if (fop->lock_count == 0) { + return _gf_true; + } + + /* We may have more than one lock, but this only happens in the rename + * fop, and both locks will reference an inode of the same type (a + * directory in this case), so we only need to check the first lock. */ + if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock; + } + + return ec->other_eager_lock; +} + +static uint32_t +ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock) +{ + if (lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock_timeout; + } + + return ec->other_eager_lock_timeout; +} + +static gf_boolean_t +ec_lock_delay_create(ec_lock_link_t *link) +{ + struct timespec delay; + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; + + delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock); + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, + ec_unlock_timer_cbk, link); + if (lock->timer == NULL) { + gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, + EC_MSG_UNLOCK_DELAY_FAILED, + "Unable to delay an unlock"); + + return _gf_false; + } + + return _gf_true; +} + void ec_unlock_timer_add(ec_lock_link_t *link) { - struct timespec delay; ec_fop_data_t *fop = link->fop; ec_lock_t *lock = link->lock; gf_boolean_t now = _gf_false; @@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link) ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, lock->release); - delay.tv_sec = 1; - delay.tv_nsec = 0; - lock->timer = gf_timer_call_after(fop->xl->ctx, delay, - ec_unlock_timer_cbk, link); - if (lock->timer == NULL) { - gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, - EC_MSG_UNLOCK_DELAY_FAILED, - "Unable to delay an unlock"); - + if (!ec_lock_delay_create(link)) { /* We are unable to create a new timer. We immediately release * the lock. */ lock->release = now = _gf_true; } + } else { ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, lock->release); @@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop) ec_update_info(&fop->locks[0]); } -static gf_boolean_t -ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) -{ - /* Fops with no locks at this point mean that they are sent as sub-fops - * of other higher level fops. In this case we simply assume that the - * parent fop will take correct care of the eager lock. */ - if (fop->lock_count == 0) { - return _gf_true; - } - - /* We may have more than one lock, but this only happens in the rename - * fop, and both locks will reference an inode of the same type (a - * directory in this case), so we only need to check the first lock. */ - if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { - return ec->eager_lock; - } - - return ec->other_eager_lock; -} - static void ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, ec_fop_data_t *fop) @@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop) ec = fop->xl->private; cbk = fop->answer; - if (ec_use_eager_lock(ec, fop) && cbk != NULL) { + if (ec_eager_lock_used(ec, fop) && cbk != NULL) { if (cbk->xdata != NULL) { if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) == 0) && (count > 1)) { diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index c3e291585ef..99e2f0653be 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, void ec_lock(ec_fop_data_t * fop); void ec_lock_reuse(ec_fop_data_t *fop); void ec_unlock(ec_fop_data_t * fop); +void ec_lock_release(ec_t *ec, inode_t *inode); gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size); diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 23dc434bc42..15b4c77abfe 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -669,6 +669,8 @@ struct _ec { uint32_t background_heals; uint32_t heal_wait_qlen; uint32_t self_heal_window_size; /* max size of read/writes */ + uint32_t eager_lock_timeout; + uint32_t other_eager_lock_timeout; struct list_head pending_fops; struct list_head heal_waiting; struct list_head healing; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 4c80f1283f1..30b0bdcb29c 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options) bool, failed); GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options, bool, failed); + GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout, + options, uint32, failed); + GF_OPTION_RECONF ("other-eager-lock-timeout", + ec->other_eager_lock_timeout, options, uint32, + failed); GF_OPTION_RECONF ("background-heals", background_heals, options, uint32, failed); GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options, @@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) } } +static gf_boolean_t +ec_upcall(ec_t *ec, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *ci = NULL; + struct gf_upcall_inodelk_contention *lc = NULL; + inode_t *inode; + + switch (upcall->event_type) { + case GF_UPCALL_CACHE_INVALIDATION: + ci = upcall->data; + ci->flags |= UP_INVAL_ATTR; + return _gf_true; + + case GF_UPCALL_INODELK_CONTENTION: + lc = upcall->data; + if (strcmp(lc->domain, ec->xl->name) != 0) { + /* The lock is not owned by EC, ignore it. */ + return _gf_true; + } + inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable, + upcall->gfid); + /* If inode is not found, it means that it's already released, + * so we can ignore it. Probably it has been released and + * destroyed while the contention notification was being sent. + */ + if (inode != NULL) { + ec_lock_release(ec, inode); + inode_unref(inode); + } + + return _gf_false; + + default: + return _gf_true; + } +} + int32_t ec_notify (xlator_t *this, int32_t event, void *data, void *data2) { @@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) dict_t *output = NULL; gf_boolean_t propagate = _gf_true; int32_t orig_event = event; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; uintptr_t mask = 0; gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p", event, data, data2); if (event == GF_EVENT_UPCALL) { - up_data = (struct gf_upcall *)data; - if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) { - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - up_ci->flags |= UP_INVAL_ATTR; - } + propagate = ec_upcall(ec, data); goto done; } @@ -664,6 +700,10 @@ init (xlator_t *this) GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed); GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed); GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed); + GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32, + failed); + GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout, + uint32, failed); GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed); GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed); GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32, @@ -1456,6 +1496,29 @@ struct volume_options options[] = .description = "It's equivalent to the eager-lock option but for non " "regular files." }, + { .key = {"eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = { GD_OP_VERSION_4_0_0 }, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = { "disperse", "locks", "timeout" }, + .description = "Maximum time (in seconds) that a lock on an inode is " + "kept held if no new operations on the inode are " + "received." + }, + { .key = {"other-eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = { GD_OP_VERSION_4_0_0 }, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = { "disperse", "locks", "timeout" }, + .description = "It's equivalent ot eager-lock-timeout option but for " + "non regular files." + }, { .key = {"background-heals"}, .type = GF_OPTION_TYPE_INT, .min = 0,/*Disabling background heals*/ |