cluster/ec: Fix issues with eager locking

Due to a race in timer cancellation, in some cases it was possible to unlock the lock while another concurrent fop that needed it continues execution as if it were not released. This patch also fixes an issue that caused a lock to not be released if an error was found while preparing ec_update_size_version(). > Change-Id: I1344a3f5ecfc333f05a09e62653838264c9c26b1 > BUG: 1331254 > Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> > Reviewed-on: http://review.gluster.org/14112 > Smoke: Gluster Build System <jenkins@build.gluster.com> > CentOS-regression: Gluster Build System <jenkins@build.gluster.com> > Reviewed-by: Chen Chen <chenchen@smartquerier.com> > NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Change-Id: I21edd17d914dfa8d2f98e6bbde50830496e12a92 BUG: 1330132 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/14174 Smoke: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
author: Xavier Hernandez <xhernandez@datalab.es> 2016-04-28 08:42:40 +0200
committer: Jeff Darcy <jdarcy@redhat.com> 2016-05-04 04:29:04 -0700
commit: 6e1de9e46b12b25d27d852d3cccadc51768e1150 (patch)
tree: e0152e58dfa39044e43193ab3b3f2d0eb6a0034f /xlators
parent: 5cd344b610c814df8375db7c3144df97766b47bd (diff)
2 files changed, 192 insertions, 75 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index de0e597d124..58cfc732ced 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -711,8 +711,7 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
     link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
     link->base = base;
 
-    lock->refs++;
-    lock->inserted++;
+    lock->refs_pending++;
 }
 
 void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
@@ -1347,6 +1346,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
         list_move_tail(&link->wait_list, list);
 
         list_add_tail(&link->owner_list, &lock->owners);
+        lock->refs_owners++;
 
         ec_lock_update_fd(lock, fop);
     }
@@ -1478,6 +1478,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
     ec_lock_link_t *timer_link = NULL;
     gf_boolean_t assigned = _gf_false;
 
+    /* The link cannot be in any list because we have just finished preparing
+     * it. */
     GF_ASSERT(list_empty(&link->wait_list));
 
     fop = link->fop;
@@ -1485,27 +1487,85 @@ ec_lock_assign_owner(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    GF_ASSERT (lock->inserted > 0);
-    lock->inserted--;
+    /* Since the link has just been prepared but it's not active yet, the
+     * refs_pending must be one at least (the ref owned by this link). */
+    GF_ASSERT (lock->refs_pending > 0);
+    /* The link is not pending any more. It will be assigned to the owner,
+     * waiting or frozen list. */
+    lock->refs_pending--;
 
     if (lock->release) {
         ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
 
-        list_add_tail(&link->wait_list, &lock->frozen);
+        /* When lock->release is set, we'll unlock the lock as soon as
+         * possible, meaning that we won't use a timer. */
+        GF_ASSERT(lock->timer == NULL);
 
-        /* The lock is frozen, so we move the current reference to refs_frozen.
-         * After that, there should remain at least one ref belonging to the
-         * lock that is processing the release. */
-        lock->refs--;
-        GF_ASSERT(lock->refs > 0);
-        lock->refs_frozen++;
+        /* The lock is marked to be released. We can still have owners and fops
+         * in the waiting ilist f they have been added before the lock has been
+         * marked to be released. However new fops are put into the frozen list
+         * to wait for the next unlock/lock cycle. */
+        list_add_tail(&link->wait_list, &lock->frozen);
 
         goto unlock;
     }
 
+    /* The lock is not marked to be released, so the frozen list should be
+     * empty. */
+    GF_ASSERT(list_empty(&lock->frozen));
+
+    if (lock->timer != NULL) {
+        /* We are trying to acquire a lock that has an unlock timer active.
+         * This means that the lock must be idle, i.e. no fop can be in the
+         * owner, waiting or frozen lists. It also means that the lock cannot
+         * have been marked as being released (this is done without timers)
+         * and it must not be exclusive. There should only be one owner
+         * reference, but it's possible that some fops are being prepared to
+         * use this lock. */
+        GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
+                   list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We take the timer_link before cancelling the timer, since a
+         * successful cancellation will destroy it. It must not be NULL
+         * because it references the fop responsible for the delayed unlock
+         * that we are currently trying to cancel. */
+        timer_link = lock->timer->data;
+        GF_ASSERT(timer_link != NULL);
+
+        gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+        ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+
+        /* We have two options here:
+         *
+         * 1. The timer has been successfully cancelled.
+         *
+         *    This is the easiest case and we can continue with the currently
+         *    acquired lock.
+         *
+         * 2. The timer callback has already been fired.
+         *
+         *    In this case we have not been able to cancel the timer before
+         *    the timer callback has been fired, but we also know that
+         *    lock->timer != NULL. This means that the timer callback is still
+         *    trying to acquire the inode mutex that we currently own. We are
+         *    safe until we release it. In this case we can safely clear
+         *    lock->timer. This will cause that the timer callback does nothing
+         *    once it acquires the mutex.
+         *
+         * In both cases we must release the owner reference assigned to the
+         * fop that was handling the unlock because ec_unlock_now() won't be
+         * called for that fop.
+         */
+        lock->timer = NULL;
+        lock->refs_owners--;
+    }
+
     lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
 
     if (!list_empty(&lock->owners)) {
+        /* There are other owners of this lock. We can only take ownership if
+         * the lock is already acquired and can be shared. Otherwise we need
+         * to wait. */
         if (!lock->acquired || (lock->exclusive != 0)) {
             ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
 
@@ -1513,36 +1573,24 @@ ec_lock_assign_owner(ec_lock_link_t *link)
 
             goto unlock;
         }
-    } else if (lock->timer != NULL) {
-        GF_ASSERT (lock->release == _gf_false);
-
-        timer_link = lock->timer->data;
-        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) {
-            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
-            lock->timer = NULL;
-            lock->refs--;
-            /* There should remain at least 1 ref, the current one. */
-            GF_ASSERT(lock->refs > 0);
-        } else {
-            /* Timer expired and on the way to unlock.
-             * Set lock->release to _gf_true, so that this
-             * lock will be put in frozen list*/
-            timer_link = NULL;
-            lock->release = _gf_true;
-        }
     }
 
     list_add_tail(&link->owner_list, &lock->owners);
+    lock->refs_owners++;
 
     assigned = _gf_true;
 
 unlock:
     if (!assigned) {
+        /* We have not been able to take ownership of this lock. The fop must
+         * be put to sleep. */
         ec_sleep(fop);
     }
 
     UNLOCK(&lock->loc.inode->lock);
 
+    /* If we have cancelled the timer, we need to resume the fop that was
+     * waiting for it. */
     if (timer_link != NULL) {
         ec_resume(timer_link->fop, 0);
     }
@@ -1566,8 +1614,14 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
 
     ec_trace("LOCK_DONE", fop, "lock=%p", lock);
 
-    GF_ASSERT(!list_empty(&link->owner_list));
+    /* Current link must belong to the owner list of the lock. We don't
+     * decrement lock->refs_owners here because the inode mutex is released
+     * before ec_unlock() is called and we need to know when the last owner
+     * unlocks the lock to do proper cleanup. lock->refs_owners is used for
+     * this task. */
+    GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
     list_del_init(&link->owner_list);
+
     lock->release |= release;
 
     if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
@@ -1625,6 +1679,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 {
     struct list_head list;
     ec_lock_t *lock;
+    gf_boolean_t destroy = _gf_false;
 
     lock = link->lock;
 
@@ -1632,18 +1687,30 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    lock->acquired = _gf_false;
+    /* The lock must be marked to be released here, since we have just released
+     * it and any attempt to assign it to more fops must have added them to the
+     * frozen list. We can only have one active reference here: the one that
+     * is processing this unfreeze. */
+    GF_ASSERT(lock->release && (lock->refs_owners == 1));
     lock->release = _gf_false;
-    lock->refs--;
+    lock->refs_owners = 0;
 
-    GF_ASSERT (lock->refs == lock->inserted);
-    GF_ASSERT(lock->exclusive == 0);
-    GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners));
+    lock->acquired = _gf_false;
+
+    /* We are unfreezing a lock. This means that the lock has already been
+     * released. In this state it shouldn't be exclusive nor have a pending
+     * timer nor have any owner, and the waiting list should be empty. Only
+     * the frozen list can contain some fop. */
+    GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
+              list_empty(&lock->waiting) && list_empty(&lock->owners));
 
+    /* We move all frozen fops to the waiting list. */
     list_splice_init(&lock->frozen, &lock->waiting);
-    lock->refs += lock->refs_frozen;
-    lock->refs_frozen = 0;
-    if (lock->refs == 0) {
+
+    /* If we don't have any fop waiting nor there are any prepared fops using
+     * this lock, we can finally dispose it. */
+    destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
+    if (destroy) {
         ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
 
         lock->ctx->inode_lock = NULL;
@@ -1657,7 +1724,7 @@ ec_lock_unfreeze(ec_lock_link_t *link)
 
     ec_lock_resume_shared(&list);
 
-    if (lock->refs == 0) {
+    if (destroy) {
         ec_lock_destroy(lock);
     }
 }
@@ -1770,9 +1837,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
 
     fop = link->fop;
 
-    GF_ASSERT(version[0] < 0x100000000);
-    GF_ASSERT(version[1] < 0x100000000);
-
     ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
              version[0], version[1], size, dirty[0], dirty[1]);
 
@@ -1814,7 +1878,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
         }
     }
 
-    /* If config information is not know, we request it now. */
+    /* If config information is not known, we request it now. */
     if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
         /* A failure requesting this xattr is ignored because it's not
          * absolutely required right now. */
@@ -1850,6 +1914,13 @@ out:
 
     gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
             "Unable to update version and size");
+
+    if ((fop->parent->id != GF_FOP_FLUSH) &&
+        (fop->parent->id != GF_FOP_FSYNC) &&
+        (fop->parent->id != GF_FOP_FSYNCDIR)) {
+        ec_unlock_lock(fop->data);
+    }
+
 }
 
 gf_boolean_t
@@ -1900,7 +1971,6 @@ ec_unlock_now(ec_lock_link_t *link)
 void
 ec_unlock_timer_del(ec_lock_link_t *link)
 {
-        int32_t before = 0;
         ec_lock_t *lock;
         inode_t *inode;
         gf_boolean_t now = _gf_false;
@@ -1922,22 +1992,24 @@ ec_unlock_timer_del(ec_lock_link_t *link)
         if (lock->timer != NULL) {
                 ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
 
+                /* The unlock timer has expired without anyone cancelling it.
+                 * This means that it shouldn't have any owner, and the
+                 * waiting and frozen lists should be empty. It shouldn't have
+                 * been marked as release nor be exclusive either. It must have
+                 * only one owner reference, but there can be fops being
+                 * prepared though. */
+                GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
+                          (lock->refs_owners == 1) &&
+                          list_empty(&lock->owners) &&
+                          list_empty(&lock->waiting) &&
+                          list_empty(&lock->frozen));
+
                 gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
                 lock->timer = NULL;
 
+                /* Any fop being processed from now on, will need to wait
+                 * until the next unlock/lock cycle. */
                 lock->release = now = _gf_true;
-
-                /* TODO: If the assertion is really true, following code is
-                 *       not needed. */
-                GF_ASSERT(list_empty(&lock->waiting));
-
-                before = lock->refs + lock->refs_frozen;
-                list_splice_init(&lock->waiting, &lock->frozen);
-                lock->refs_frozen += lock->refs - lock->inserted - 1;
-                lock->refs = 1 + lock->inserted;
-                /* We moved around the locks, so total number of locks shouldn't
-                 * change by this operation*/
-                GF_ASSERT (before == (lock->refs + lock->refs_frozen));
         }
 
         UNLOCK(&inode->lock);
@@ -1961,24 +2033,50 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
 
     LOCK(&lock->loc.inode->lock);
 
-    GF_ASSERT(lock->timer == NULL);
+    /* We are trying to unlock the lock. We can have multiple scenarios here,
+     * but all of them need to have lock->timer == NULL:
+     *
+     * 1. There are other owners currently running that can call ec_unlock().
+     *
+     *    None of them can have started the timer until the last one. But this
+     *    call should be the consequence of this lastest one.
+     *
+     * 2. There are fops in the waiting or frozen lists.
+     *
+     *    These fops cannot call ec_unlock(). So we should be here.
+     *
+     * We must reach here with at least one owner reference.
+     */
+    GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
 
-    if ((lock->refs - lock->inserted) > 1) {
+    /* If the fop detects that a heal is needed, we mark the lock to be
+     * released as soon as possible. */
+    lock->release |= ec_fop_needs_heal(fop);
+
+    if (lock->refs_owners > 1) {
         ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
 
-        lock->refs--;
+        /* If there are other owners we cannot do anything else with the lock.
+         * Note that the current fop has already been removed from the owners
+         * list in ec_lock_reuse(). */
+        lock->refs_owners--;
 
         UNLOCK(&lock->loc.inode->lock);
     } else if (lock->acquired) {
-        ec_t *ec = fop->xl->private;
+        /* There are no other owners and the lock is acquired. If there were
+         * fops waiting, at least one of them should have been promoted to an
+         * owner, so the waiting list should be empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
 
-        GF_ASSERT(list_empty(&lock->owners));
+        ec_t *ec = fop->xl->private;
 
+        /* If everything goes as expected this fop will be put to sleep until
+         * the timer callback is executed. */
         ec_sleep(fop);
 
-        /* If healing is needed, the lock needs to be released due to
-         * contention, or ec is shutting down, do not delay lock release. */
-        if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) {
+        /* If the lock needs to be released, or ec is shutting down, do not
+         * delay lock release. */
+        if (!lock->release && !ec->shutdown) {
             ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
                      lock->release);
 
@@ -1989,9 +2087,10 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             if (lock->timer == NULL) {
                 gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
                        EC_MSG_UNLOCK_DELAY_FAILED,
-                       "Unable to delay an "
-                       "unlock");
+                       "Unable to delay an unlock");
 
+                /* We are unable to create a new timer. We immediately release
+                 * the lock. */
                 lock->release = now = _gf_true;
             }
         } else {
@@ -2006,10 +2105,17 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             ec_unlock_now(link);
         }
     } else {
+        /* There are no owners and the lock is not acquired. This can only
+         * happen if a lock attempt has failed and we get to the unlock step
+         * of the fop. As in the previous case, the waiting list must be
+         * empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We need to mark the lock to be released to correctly handle fops
+         * that may get in after we release the inode mutex but before
+         * ec_lock_unfreeze() is processed. */
         lock->release = _gf_true;
 
-        GF_ASSERT(list_empty(&lock->owners));
-
         UNLOCK(&lock->loc.inode->lock);
 
         ec_lock_unfreeze(link);
@@ -2052,7 +2158,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
             }
         }
     } else {
-        /* If eager lock is disabled or If we haven't get
+        /* If eager lock is disabled or if we haven't get
          * an answer with enough quorum, we always release
          * the lock. */
         release = _gf_true;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9107b4b156e..f4214ecfed7 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -139,17 +139,28 @@ struct _ec_lock
 {
     ec_inode_t        *ctx;
     gf_timer_t        *timer;
-    struct list_head   owners;  /* List of owners of this lock. */
-    struct list_head   waiting; /* Queue of requests being serviced. */
-    struct list_head   frozen;  /* Queue of requests that will be serviced in
-                                   the next unlock/lock cycle. */
+
+    /* List of owners of this lock. All fops added to this list are running
+     * concurrently. */
+    struct list_head   owners;
+
+    /* List of fops waiting to be an owner of the lock. Fops are added to this
+     * list when the current owner has an incompatible access (shared vs
+     * exclusive) or the lock is not acquired yet. */
+    struct list_head   waiting;
+
+    /* List of fops that will wait until the next unlock/lock cycle. This
+     * happens when the currently acquired lock is decided to be released as
+     * soon as possible. In this case, all frozen fops will be continued only
+     * after the lock is reacquired. */
+    struct list_head   frozen;
+
     int32_t            exclusive;
     uintptr_t          mask;
     uintptr_t          good_mask;
     uintptr_t          healing;
-    int32_t            refs;
-    int32_t            refs_frozen;
-    int32_t            inserted;
+    uint32_t           refs_owners;  /* Refs for fops owning the lock */
+    uint32_t           refs_pending; /* Refs assigned to fops being prepared */
     gf_boolean_t       acquired;
     gf_boolean_t       getting_size;
     gf_boolean_t       release;
author	Xavier Hernandez <xhernandez@datalab.es>	2016-04-28 08:42:40 +0200
committer	Jeff Darcy <jdarcy@redhat.com>	2016-05-04 04:29:04 -0700
commit	6e1de9e46b12b25d27d852d3cccadc51768e1150 (patch)
tree	e0152e58dfa39044e43193ab3b3f2d0eb6a0034f /xlators
parent	5cd344b610c814df8375db7c3144df97766b47bd (diff)