summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/ec/src/ec-common.c242
-rw-r--r--xlators/cluster/ec/src/ec-data.h25
2 files changed, 192 insertions, 75 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index de0e597d124..58cfc732ced 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -711,8 +711,7 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
link->base = base;
- lock->refs++;
- lock->inserted++;
+ lock->refs_pending++;
}
void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
@@ -1347,6 +1346,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
list_move_tail(&link->wait_list, list);
list_add_tail(&link->owner_list, &lock->owners);
+ lock->refs_owners++;
ec_lock_update_fd(lock, fop);
}
@@ -1478,6 +1478,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
ec_lock_link_t *timer_link = NULL;
gf_boolean_t assigned = _gf_false;
+ /* The link cannot be in any list because we have just finished preparing
+ * it. */
GF_ASSERT(list_empty(&link->wait_list));
fop = link->fop;
@@ -1485,27 +1487,85 @@ ec_lock_assign_owner(ec_lock_link_t *link)
LOCK(&lock->loc.inode->lock);
- GF_ASSERT (lock->inserted > 0);
- lock->inserted--;
+ /* Since the link has just been prepared but it's not active yet, the
+ * refs_pending must be one at least (the ref owned by this link). */
+ GF_ASSERT (lock->refs_pending > 0);
+ /* The link is not pending any more. It will be assigned to the owner,
+ * waiting or frozen list. */
+ lock->refs_pending--;
if (lock->release) {
ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
- list_add_tail(&link->wait_list, &lock->frozen);
+ /* When lock->release is set, we'll unlock the lock as soon as
+ * possible, meaning that we won't use a timer. */
+ GF_ASSERT(lock->timer == NULL);
- /* The lock is frozen, so we move the current reference to refs_frozen.
- * After that, there should remain at least one ref belonging to the
- * lock that is processing the release. */
- lock->refs--;
- GF_ASSERT(lock->refs > 0);
- lock->refs_frozen++;
+ /* The lock is marked to be released. We can still have owners and fops
+ * in the waiting list if they have been added before the lock has been
+ * marked to be released. However new fops are put into the frozen list
+ * to wait for the next unlock/lock cycle. */
+ list_add_tail(&link->wait_list, &lock->frozen);
goto unlock;
}
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ if (lock->timer != NULL) {
+ /* We are trying to acquire a lock that has an unlock timer active.
+ * This means that the lock must be idle, i.e. no fop can be in the
+ * owner, waiting or frozen lists. It also means that the lock cannot
+ * have been marked as being released (this is done without timers)
+ * and it must not be exclusive. There should only be one owner
+ * reference, but it's possible that some fops are being prepared to
+ * use this lock. */
+ GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
+ list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We take the timer_link before cancelling the timer, since a
+ * successful cancellation will destroy it. It must not be NULL
+ * because it references the fop responsible for the delayed unlock
+ * that we are currently trying to cancel. */
+ timer_link = lock->timer->data;
+ GF_ASSERT(timer_link != NULL);
+
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+
+ /* We have two options here:
+ *
+ * 1. The timer has been successfully cancelled.
+ *
+ * This is the easiest case and we can continue with the currently
+ * acquired lock.
+ *
+ * 2. The timer callback has already been fired.
+ *
+ * In this case we have not been able to cancel the timer before
+ * the timer callback has been fired, but we also know that
+ * lock->timer != NULL. This means that the timer callback is still
+ * trying to acquire the inode mutex that we currently own. We are
+ * safe until we release it. In this case we can safely clear
+ * lock->timer. This will cause that the timer callback does nothing
+ * once it acquires the mutex.
+ *
+ * In both cases we must release the owner reference assigned to the
+ * fop that was handling the unlock because ec_unlock_now() won't be
+ * called for that fop.
+ */
+ lock->timer = NULL;
+ lock->refs_owners--;
+ }
+
lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
if (!list_empty(&lock->owners)) {
+ /* There are other owners of this lock. We can only take ownership if
+ * the lock is already acquired and can be shared. Otherwise we need
+ * to wait. */
if (!lock->acquired || (lock->exclusive != 0)) {
ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
@@ -1513,36 +1573,24 @@ ec_lock_assign_owner(ec_lock_link_t *link)
goto unlock;
}
- } else if (lock->timer != NULL) {
- GF_ASSERT (lock->release == _gf_false);
-
- timer_link = lock->timer->data;
- if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) {
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
- lock->timer = NULL;
- lock->refs--;
- /* There should remain at least 1 ref, the current one. */
- GF_ASSERT(lock->refs > 0);
- } else {
- /* Timer expired and on the way to unlock.
- * Set lock->release to _gf_true, so that this
- * lock will be put in frozen list*/
- timer_link = NULL;
- lock->release = _gf_true;
- }
}
list_add_tail(&link->owner_list, &lock->owners);
+ lock->refs_owners++;
assigned = _gf_true;
unlock:
if (!assigned) {
+ /* We have not been able to take ownership of this lock. The fop must
+ * be put to sleep. */
ec_sleep(fop);
}
UNLOCK(&lock->loc.inode->lock);
+ /* If we have cancelled the timer, we need to resume the fop that was
+ * waiting for it. */
if (timer_link != NULL) {
ec_resume(timer_link->fop, 0);
}
@@ -1566,8 +1614,14 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
ec_trace("LOCK_DONE", fop, "lock=%p", lock);
- GF_ASSERT(!list_empty(&link->owner_list));
+ /* Current link must belong to the owner list of the lock. We don't
+ * decrement lock->refs_owners here because the inode mutex is released
+ * before ec_unlock() is called and we need to know when the last owner
+ * unlocks the lock to do proper cleanup. lock->refs_owners is used for
+ * this task. */
+ GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
list_del_init(&link->owner_list);
+
lock->release |= release;
if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
@@ -1625,6 +1679,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
{
struct list_head list;
ec_lock_t *lock;
+ gf_boolean_t destroy = _gf_false;
lock = link->lock;
@@ -1632,18 +1687,30 @@ ec_lock_unfreeze(ec_lock_link_t *link)
LOCK(&lock->loc.inode->lock);
- lock->acquired = _gf_false;
+ /* The lock must be marked to be released here, since we have just released
+ * it and any attempt to assign it to more fops must have added them to the
+ * frozen list. We can only have one active reference here: the one that
+ * is processing this unfreeze. */
+ GF_ASSERT(lock->release && (lock->refs_owners == 1));
lock->release = _gf_false;
- lock->refs--;
+ lock->refs_owners = 0;
- GF_ASSERT (lock->refs == lock->inserted);
- GF_ASSERT(lock->exclusive == 0);
- GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners));
+ lock->acquired = _gf_false;
+
+ /* We are unfreezing a lock. This means that the lock has already been
+ * released. In this state it shouldn't be exclusive nor have a pending
+ * timer nor have any owner, and the waiting list should be empty. Only
+ * the frozen list can contain some fop. */
+ GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
+ list_empty(&lock->waiting) && list_empty(&lock->owners));
+ /* We move all frozen fops to the waiting list. */
list_splice_init(&lock->frozen, &lock->waiting);
- lock->refs += lock->refs_frozen;
- lock->refs_frozen = 0;
- if (lock->refs == 0) {
+
+ /* If we don't have any fop waiting nor there are any prepared fops using
+ * this lock, we can finally dispose it. */
+ destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
+ if (destroy) {
ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
lock->ctx->inode_lock = NULL;
@@ -1657,7 +1724,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
ec_lock_resume_shared(&list);
- if (lock->refs == 0) {
+ if (destroy) {
ec_lock_destroy(lock);
}
}
@@ -1770,9 +1837,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
fop = link->fop;
- GF_ASSERT(version[0] < 0x100000000);
- GF_ASSERT(version[1] < 0x100000000);
-
ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
version[0], version[1], size, dirty[0], dirty[1]);
@@ -1814,7 +1878,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
}
}
- /* If config information is not know, we request it now. */
+ /* If config information is not known, we request it now. */
if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
/* A failure requesting this xattr is ignored because it's not
* absolutely required right now. */
@@ -1850,6 +1914,13 @@ out:
gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
"Unable to update version and size");
+
+ if ((fop->parent->id != GF_FOP_FLUSH) &&
+ (fop->parent->id != GF_FOP_FSYNC) &&
+ (fop->parent->id != GF_FOP_FSYNCDIR)) {
+ ec_unlock_lock(fop->data);
+ }
+
}
gf_boolean_t
@@ -1900,7 +1971,6 @@ ec_unlock_now(ec_lock_link_t *link)
void
ec_unlock_timer_del(ec_lock_link_t *link)
{
- int32_t before = 0;
ec_lock_t *lock;
inode_t *inode;
gf_boolean_t now = _gf_false;
@@ -1922,22 +1992,24 @@ ec_unlock_timer_del(ec_lock_link_t *link)
if (lock->timer != NULL) {
ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
+ /* The unlock timer has expired without anyone cancelling it.
+ * This means that it shouldn't have any owner, and the
+ * waiting and frozen lists should be empty. It shouldn't have
+ * been marked as release nor be exclusive either. It must have
+ * only one owner reference, but there can be fops being
+ * prepared though. */
+ GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
+ (lock->refs_owners == 1) &&
+ list_empty(&lock->owners) &&
+ list_empty(&lock->waiting) &&
+ list_empty(&lock->frozen));
+
gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
lock->timer = NULL;
+ /* Any fop being processed from now on, will need to wait
+ * until the next unlock/lock cycle. */
lock->release = now = _gf_true;
-
- /* TODO: If the assertion is really true, following code is
- * not needed. */
- GF_ASSERT(list_empty(&lock->waiting));
-
- before = lock->refs + lock->refs_frozen;
- list_splice_init(&lock->waiting, &lock->frozen);
- lock->refs_frozen += lock->refs - lock->inserted - 1;
- lock->refs = 1 + lock->inserted;
- /* We moved around the locks, so total number of locks shouldn't
- * change by this operation*/
- GF_ASSERT (before == (lock->refs + lock->refs_frozen));
}
UNLOCK(&inode->lock);
@@ -1961,24 +2033,50 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
LOCK(&lock->loc.inode->lock);
- GF_ASSERT(lock->timer == NULL);
+ /* We are trying to unlock the lock. We can have multiple scenarios here,
+ * but all of them need to have lock->timer == NULL:
+ *
+ * 1. There are other owners currently running that can call ec_unlock().
+ *
+ * None of them can have started the timer until the last one. But this
+ * call should be the consequence of this latest one.
+ *
+ * 2. There are fops in the waiting or frozen lists.
+ *
+ * These fops cannot call ec_unlock(). So we should be here.
+ *
+ * We must reach here with at least one owner reference.
+ */
+ GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
- if ((lock->refs - lock->inserted) > 1) {
+ /* If the fop detects that a heal is needed, we mark the lock to be
+ * released as soon as possible. */
+ lock->release |= ec_fop_needs_heal(fop);
+
+ if (lock->refs_owners > 1) {
ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
- lock->refs--;
+ /* If there are other owners we cannot do anything else with the lock.
+ * Note that the current fop has already been removed from the owners
+ * list in ec_lock_reuse(). */
+ lock->refs_owners--;
UNLOCK(&lock->loc.inode->lock);
} else if (lock->acquired) {
- ec_t *ec = fop->xl->private;
+ /* There are no other owners and the lock is acquired. If there were
+ * fops waiting, at least one of them should have been promoted to an
+ * owner, so the waiting list should be empty. */
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
- GF_ASSERT(list_empty(&lock->owners));
+ ec_t *ec = fop->xl->private;
+ /* If everything goes as expected this fop will be put to sleep until
+ * the timer callback is executed. */
ec_sleep(fop);
- /* If healing is needed, the lock needs to be released due to
- * contention, or ec is shutting down, do not delay lock release. */
- if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) {
+ /* If the lock needs to be released, or ec is shutting down, do not
+ * delay lock release. */
+ if (!lock->release && !ec->shutdown) {
ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
lock->release);
@@ -1989,9 +2087,10 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
if (lock->timer == NULL) {
gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
EC_MSG_UNLOCK_DELAY_FAILED,
- "Unable to delay an "
- "unlock");
+ "Unable to delay an unlock");
+ /* We are unable to create a new timer. We immediately release
+ * the lock. */
lock->release = now = _gf_true;
}
} else {
@@ -2006,10 +2105,17 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
ec_unlock_now(link);
}
} else {
+ /* There are no owners and the lock is not acquired. This can only
+ * happen if a lock attempt has failed and we get to the unlock step
+ * of the fop. As in the previous case, the waiting list must be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We need to mark the lock to be released to correctly handle fops
+ * that may get in after we release the inode mutex but before
+ * ec_lock_unfreeze() is processed. */
lock->release = _gf_true;
- GF_ASSERT(list_empty(&lock->owners));
-
UNLOCK(&lock->loc.inode->lock);
ec_lock_unfreeze(link);
@@ -2052,7 +2158,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
}
}
} else {
- /* If eager lock is disabled or If we haven't get
+ /* If eager lock is disabled or if we haven't got
* an answer with enough quorum, we always release
* the lock. */
release = _gf_true;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9107b4b156e..f4214ecfed7 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -139,17 +139,28 @@ struct _ec_lock
{
ec_inode_t *ctx;
gf_timer_t *timer;
- struct list_head owners; /* List of owners of this lock. */
- struct list_head waiting; /* Queue of requests being serviced. */
- struct list_head frozen; /* Queue of requests that will be serviced in
- the next unlock/lock cycle. */
+
+ /* List of owners of this lock. All fops added to this list are running
+ * concurrently. */
+ struct list_head owners;
+
+ /* List of fops waiting to be an owner of the lock. Fops are added to this
+ * list when the current owner has an incompatible access (shared vs
+ * exclusive) or the lock is not acquired yet. */
+ struct list_head waiting;
+
+ /* List of fops that will wait until the next unlock/lock cycle. This
+ * happens when the currently acquired lock is decided to be released as
+ * soon as possible. In this case, all frozen fops will be continued only
+ * after the lock is reacquired. */
+ struct list_head frozen;
+
int32_t exclusive;
uintptr_t mask;
uintptr_t good_mask;
uintptr_t healing;
- int32_t refs;
- int32_t refs_frozen;
- int32_t inserted;
+ uint32_t refs_owners; /* Refs for fops owning the lock */
+ uint32_t refs_pending; /* Refs assigned to fops being prepared */
gf_boolean_t acquired;
gf_boolean_t getting_size;
gf_boolean_t release;