Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 242
-rw-r--r-- | xlators/cluster/ec/src/ec-data.h   |  25
2 files changed, 192 insertions, 75 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index de0e597d124..58cfc732ced 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -711,8 +711,7 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
     link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
     link->base = base;
 
-    lock->refs++;
-    lock->inserted++;
+    lock->refs_pending++;
 }
 
 void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
@@ -1347,6 +1346,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
         list_move_tail(&link->wait_list, list);
 
         list_add_tail(&link->owner_list, &lock->owners);
+        lock->refs_owners++;
 
         ec_lock_update_fd(lock, fop);
     }
@@ -1478,6 +1478,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
     ec_lock_link_t *timer_link = NULL;
     gf_boolean_t assigned = _gf_false;
 
+    /* The link cannot be in any list because we have just finished preparing
+     * it. */
    GF_ASSERT(list_empty(&link->wait_list));
 
     fop = link->fop;
@@ -1485,27 +1487,85 @@ ec_lock_assign_owner(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    GF_ASSERT (lock->inserted > 0);
-    lock->inserted--;
+    /* Since the link has just been prepared but it's not active yet, the
+     * refs_pending must be one at least (the ref owned by this link). */
+    GF_ASSERT (lock->refs_pending > 0);
+    /* The link is not pending any more. It will be assigned to the owner,
+     * waiting or frozen list. */
+    lock->refs_pending--;
 
     if (lock->release) {
         ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
 
-        list_add_tail(&link->wait_list, &lock->frozen);
+        /* When lock->release is set, we'll unlock the lock as soon as
+         * possible, meaning that we won't use a timer. */
+        GF_ASSERT(lock->timer == NULL);
 
-        /* The lock is frozen, so we move the current reference to refs_frozen.
-         * After that, there should remain at least one ref belonging to the
-         * lock that is processing the release. */
-        lock->refs--;
-        GF_ASSERT(lock->refs > 0);
-        lock->refs_frozen++;
+        /* The lock is marked to be released. We can still have owners and fops
+         * in the waiting list if they have been added before the lock has been
+         * marked to be released. However new fops are put into the frozen list
+         * to wait for the next unlock/lock cycle. */
+        list_add_tail(&link->wait_list, &lock->frozen);
 
         goto unlock;
     }
 
+    /* The lock is not marked to be released, so the frozen list should be
+     * empty. */
+    GF_ASSERT(list_empty(&lock->frozen));
+
+    if (lock->timer != NULL) {
+        /* We are trying to acquire a lock that has an unlock timer active.
+         * This means that the lock must be idle, i.e. no fop can be in the
+         * owner, waiting or frozen lists. It also means that the lock cannot
+         * have been marked as being released (this is done without timers)
+         * and it must not be exclusive. There should only be one owner
+         * reference, but it's possible that some fops are being prepared to
+         * use this lock. */
+        GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
+                   list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We take the timer_link before cancelling the timer, since a
+         * successful cancellation will destroy it. It must not be NULL
+         * because it references the fop responsible for the delayed unlock
+         * that we are currently trying to cancel. */
+        timer_link = lock->timer->data;
+        GF_ASSERT(timer_link != NULL);
+
+        gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+        ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+
+        /* We have two options here:
+         *
+         * 1. The timer has been successfully cancelled.
+         *
+         *    This is the easiest case and we can continue with the currently
+         *    acquired lock.
+         *
+         * 2. The timer callback has already been fired.
+         *
+         *    In this case we have not been able to cancel the timer before
+         *    the timer callback has been fired, but we also know that
+         *    lock->timer != NULL. This means that the timer callback is still
+         *    trying to acquire the inode mutex that we currently own. We are
+         *    safe until we release it. In this case we can safely clear
+         *    lock->timer. This will cause that the timer callback does nothing
+         *    once it acquires the mutex.
+         *
+         * In both cases we must release the owner reference assigned to the
+         * fop that was handling the unlock because ec_unlock_now() won't be
+         * called for that fop.
+         */
+        lock->timer = NULL;
+        lock->refs_owners--;
+    }
+
     lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
 
     if (!list_empty(&lock->owners)) {
+        /* There are other owners of this lock. We can only take ownership if
+         * the lock is already acquired and can be shared. Otherwise we need
+         * to wait. */
         if (!lock->acquired || (lock->exclusive != 0)) {
             ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
 
@@ -1513,36 +1573,24 @@
 
             goto unlock;
         }
-    } else if (lock->timer != NULL) {
-        GF_ASSERT (lock->release == _gf_false);
-
-        timer_link = lock->timer->data;
-        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) {
-            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
-            lock->timer = NULL;
-            lock->refs--;
-            /* There should remain at least 1 ref, the current one. */
-            GF_ASSERT(lock->refs > 0);
-        } else {
-            /* Timer expired and on the way to unlock.
-             * Set lock->release to _gf_true, so that this
-             * lock will be put in frozen list*/
-            timer_link = NULL;
-            lock->release = _gf_true;
-        }
     }
 
     list_add_tail(&link->owner_list, &lock->owners);
+    lock->refs_owners++;
     assigned = _gf_true;
 
 unlock:
     if (!assigned) {
+        /* We have not been able to take ownership of this lock. The fop must
+         * be put to sleep. */
         ec_sleep(fop);
     }
 
     UNLOCK(&lock->loc.inode->lock);
 
+    /* If we have cancelled the timer, we need to resume the fop that was
+     * waiting for it. */
    if (timer_link != NULL) {
         ec_resume(timer_link->fop, 0);
     }
@@ -1566,8 +1614,14 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
 
     ec_trace("LOCK_DONE", fop, "lock=%p", lock);
 
-    GF_ASSERT(!list_empty(&link->owner_list));
+    /* Current link must belong to the owner list of the lock. We don't
+     * decrement lock->refs_owners here because the inode mutex is released
+     * before ec_unlock() is called and we need to know when the last owner
+     * unlocks the lock to do proper cleanup. lock->refs_owners is used for
+     * this task. */
+    GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
     list_del_init(&link->owner_list);
+
     lock->release |= release;
 
     if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
@@ -1625,6 +1679,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 {
     struct list_head list;
     ec_lock_t *lock;
+    gf_boolean_t destroy = _gf_false;
 
     lock = link->lock;
 
@@ -1632,18 +1687,30 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    lock->acquired = _gf_false;
+    /* The lock must be marked to be released here, since we have just released
+     * it and any attempt to assign it to more fops must have added them to the
+     * frozen list. We can only have one active reference here: the one that
+     * is processing this unfreeze. */
+    GF_ASSERT(lock->release && (lock->refs_owners == 1));
     lock->release = _gf_false;
-    lock->refs--;
+    lock->refs_owners = 0;
 
-    GF_ASSERT (lock->refs == lock->inserted);
-    GF_ASSERT(lock->exclusive == 0);
-    GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners));
+    lock->acquired = _gf_false;
+
+    /* We are unfreezing a lock. This means that the lock has already been
+     * released. In this state it shouldn't be exclusive nor have a pending
+     * timer nor have any owner, and the waiting list should be empty. Only
+     * the frozen list can contain some fop. */
+    GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
+              list_empty(&lock->waiting) && list_empty(&lock->owners));
 
+    /* We move all frozen fops to the waiting list. */
     list_splice_init(&lock->frozen, &lock->waiting);
-    lock->refs += lock->refs_frozen;
-    lock->refs_frozen = 0;
-    if (lock->refs == 0) {
+
+    /* If we don't have any fop waiting nor there are any prepared fops using
+     * this lock, we can finally dispose it. */
+    destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
+    if (destroy) {
         ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
 
         lock->ctx->inode_lock = NULL;
@@ -1657,7 +1724,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 
     ec_lock_resume_shared(&list);
 
-    if (lock->refs == 0) {
+    if (destroy) {
         ec_lock_destroy(lock);
     }
 }
@@ -1770,9 +1837,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
 
     fop = link->fop;
 
-    GF_ASSERT(version[0] < 0x100000000);
-    GF_ASSERT(version[1] < 0x100000000);
-
     ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
              version[0], version[1], size, dirty[0], dirty[1]);
 
@@ -1814,7 +1878,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
         }
     }
 
-    /* If config information is not know, we request it now. */
+    /* If config information is not known, we request it now. */
     if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
         /* A failure requesting this xattr is ignored because it's not
          * absolutely required right now. */
@@ -1850,6 +1914,13 @@ out:
     gf_msg (fop->xl->name, GF_LOG_ERROR, -err,
             EC_MSG_SIZE_VERS_UPDATE_FAIL,
             "Unable to update version and size");
+
+    if ((fop->parent->id != GF_FOP_FLUSH) &&
+        (fop->parent->id != GF_FOP_FSYNC) &&
+        (fop->parent->id != GF_FOP_FSYNCDIR)) {
+        ec_unlock_lock(fop->data);
+    }
+
 }
 
 gf_boolean_t
@@ -1900,7 +1971,6 @@ ec_unlock_now(ec_lock_link_t *link)
 
 void ec_unlock_timer_del(ec_lock_link_t *link)
 {
-    int32_t before = 0;
     ec_lock_t *lock;
     inode_t *inode;
     gf_boolean_t now = _gf_false;
@@ -1922,22 +1992,24 @@ void ec_unlock_timer_del(ec_lock_link_t *link)
     if (lock->timer != NULL) {
         ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
 
+        /* The unlock timer has expired without anyone cancelling it.
+         * This means that it shouldn't have any owner, and the
+         * waiting and frozen lists should be empty. It shouldn't have
+         * been marked as release nor be exclusive either. It must have
+         * only one owner reference, but there can be fops being
+         * prepared though. */
+        GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
+                  (lock->refs_owners == 1) &&
+                  list_empty(&lock->owners) &&
+                  list_empty(&lock->waiting) &&
+                  list_empty(&lock->frozen));
+
         gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
         lock->timer = NULL;
 
+        /* Any fop being processed from now on, will need to wait
+         * until the next unlock/lock cycle. */
         lock->release = now = _gf_true;
-
-        /* TODO: If the assertion is really true, following code is
-         * not needed. */
-        GF_ASSERT(list_empty(&lock->waiting));
-
-        before = lock->refs + lock->refs_frozen;
-        list_splice_init(&lock->waiting, &lock->frozen);
-        lock->refs_frozen += lock->refs - lock->inserted - 1;
-        lock->refs = 1 + lock->inserted;
-        /* We moved around the locks, so total number of locks shouldn't
-         * change by this operation*/
-        GF_ASSERT (before == (lock->refs + lock->refs_frozen));
     }
 
     UNLOCK(&inode->lock);
@@ -1961,24 +2033,50 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    GF_ASSERT(lock->timer == NULL);
+    /* We are trying to unlock the lock. We can have multiple scenarios here,
+     * but all of them need to have lock->timer == NULL:
+     *
+     * 1. There are other owners currently running that can call ec_unlock().
+     *
+     *    None of them can have started the timer until the last one. But this
+     *    call should be the consequence of this latest one.
+     *
+     * 2. There are fops in the waiting or frozen lists.
+     *
+     *    These fops cannot call ec_unlock(). So we should be here.
+     *
+     * We must reach here with at least one owner reference.
+     */
+    GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
 
-    if ((lock->refs - lock->inserted) > 1) {
+    /* If the fop detects that a heal is needed, we mark the lock to be
+     * released as soon as possible. */
+    lock->release |= ec_fop_needs_heal(fop);
+
+    if (lock->refs_owners > 1) {
         ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
 
-        lock->refs--;
+        /* If there are other owners we cannot do anything else with the lock.
+         * Note that the current fop has already been removed from the owners
+         * list in ec_lock_reuse(). */
+        lock->refs_owners--;
 
         UNLOCK(&lock->loc.inode->lock);
     } else if (lock->acquired) {
-        ec_t *ec = fop->xl->private;
+        /* There are no other owners and the lock is acquired. If there were
+         * fops waiting, at least one of them should have been promoted to an
+         * owner, so the waiting list should be empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
 
-        GF_ASSERT(list_empty(&lock->owners));
+        ec_t *ec = fop->xl->private;
 
+        /* If everything goes as expected this fop will be put to sleep until
+         * the timer callback is executed. */
         ec_sleep(fop);
 
-        /* If healing is needed, the lock needs to be released due to
-         * contention, or ec is shutting down, do not delay lock release. */
-        if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) {
+        /* If the lock needs to be released, or ec is shutting down, do not
+         * delay lock release. */
+        if (!lock->release && !ec->shutdown) {
             ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
                      lock->release);
 
@@ -1989,9 +2087,10 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             if (lock->timer == NULL) {
                 gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
                        EC_MSG_UNLOCK_DELAY_FAILED,
-                       "Unable to delay an "
-                       "unlock");
+                       "Unable to delay an unlock");
 
+                /* We are unable to create a new timer. We immediately release
+                 * the lock. */
                 lock->release = now = _gf_true;
             }
         } else {
@@ -2006,10 +2105,17 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             ec_unlock_now(link);
         }
     } else {
+        /* There are no owners and the lock is not acquired. This can only
+         * happen if a lock attempt has failed and we get to the unlock step
+         * of the fop. As in the previous case, the waiting list must be
+         * empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We need to mark the lock to be released to correctly handle fops
+         * that may get in after we release the inode mutex but before
+         * ec_lock_unfreeze() is processed. */
         lock->release = _gf_true;
 
-        GF_ASSERT(list_empty(&lock->owners));
-
         UNLOCK(&lock->loc.inode->lock);
 
         ec_lock_unfreeze(link);
@@ -2052,7 +2158,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
             }
         }
     } else {
-        /* If eager lock is disabled or If we haven't get
+        /* If eager lock is disabled or if we haven't get
         * an answer with enough quorum, we always release
         * the lock. */
        release = _gf_true;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9107b4b156e..f4214ecfed7 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -139,17 +139,28 @@ struct _ec_lock {
     ec_inode_t *ctx;
     gf_timer_t *timer;
-    struct list_head owners;  /* List of owners of this lock. */
-    struct list_head waiting; /* Queue of requests being serviced. */
-    struct list_head frozen;  /* Queue of requests that will be serviced in
-                                 the next unlock/lock cycle. */
+
+    /* List of owners of this lock. All fops added to this list are running
+     * concurrently. */
+    struct list_head owners;
+
+    /* List of fops waiting to be an owner of the lock. Fops are added to this
+     * list when the current owner has an incompatible access (shared vs
+     * exclusive) or the lock is not acquired yet. */
+    struct list_head waiting;
+
+    /* List of fops that will wait until the next unlock/lock cycle. This
+     * happens when the currently acquired lock is decided to be released as
+     * soon as possible. In this case, all frozen fops will be continued only
+     * after the lock is reacquired. */
+    struct list_head frozen;
+
     int32_t exclusive;
     uintptr_t mask;
     uintptr_t good_mask;
     uintptr_t healing;
-    int32_t refs;
-    int32_t refs_frozen;
-    int32_t inserted;
+    uint32_t refs_owners;  /* Refs for fops owning the lock */
+    uint32_t refs_pending; /* Refs assigned to fops being prepared */
     gf_boolean_t acquired;
     gf_boolean_t getting_size;
     gf_boolean_t release;
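
For reference, the reference counting scheme introduced by this change can be summarised with a small standalone model. This is only an illustrative sketch, not GlusterFS code: the demo_* names are hypothetical, and the waiting, frozen and timer paths handled by ec_lock_assign_owner() and ec_unlock_timer_add() in the diff above are intentionally omitted.

/* Minimal model of the refs_pending/refs_owners accounting. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    unsigned refs_owners;  /* fops currently owning the lock */
    unsigned refs_pending; /* fops prepared but not yet assigned */
    bool     release;      /* lock marked to be released */
} demo_lock;

/* Mirrors ec_lock_insert(): a fop that will use the lock only takes a
 * pending reference while it is being prepared. */
static void demo_prepare(demo_lock *lock)
{
    lock->refs_pending++;
}

/* Mirrors the owner path of ec_lock_assign_owner(): the pending reference
 * is converted into an owner reference. */
static void demo_assign_owner(demo_lock *lock)
{
    assert(lock->refs_pending > 0);
    lock->refs_pending--;
    lock->refs_owners++;
}

/* Mirrors ec_unlock_timer_add(): only the last owner really releases the
 * lock; the others just drop their owner reference and skip. */
static void demo_unlock(demo_lock *lock)
{
    assert(lock->refs_owners > 0);
    if (lock->refs_owners > 1) {
        lock->refs_owners--;  /* other owners still running: skip */
        return;
    }
    lock->release = true;     /* last owner: release (or arm a timer) */
}

/* Mirrors the 'destroy' check in ec_lock_unfreeze(): the lock can only be
 * disposed of when nothing owns it and nothing is being prepared. */
static bool demo_can_destroy(const demo_lock *lock)
{
    return (lock->refs_owners == 0) && (lock->refs_pending == 0);
}

int main(void)
{
    demo_lock lock = {0};

    demo_prepare(&lock);      /* fop A prepared -> refs_pending = 1 */
    demo_prepare(&lock);      /* fop B prepared -> refs_pending = 2 */
    demo_assign_owner(&lock); /* fop A owns     -> owners = 1, pending = 1 */
    demo_assign_owner(&lock); /* fop B owns     -> owners = 2, pending = 0 */
    demo_unlock(&lock);       /* fop A done     -> owners = 1 (skip) */
    demo_unlock(&lock);       /* fop B done     -> release requested */
    lock.refs_owners--;       /* ec_lock_unfreeze() drops the last owner ref */

    printf("release=%d destroyable=%d\n", lock.release, demo_can_destroy(&lock));
    return 0;
}

The key invariant mirrored here is that a lock may only be destroyed once it has no owners, no waiters and no prepared fops (refs_pending == 0), which is exactly the condition computed into 'destroy' in ec_lock_unfreeze().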