1 files changed, 166 insertions, 87 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index cb627a92c9c..fbc7ac97aa0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
     return _gf_true;
 }
 
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+        ec_lock_link_t *timer_link;
+
+        /* If we don't have any timer, there's nothing to cancel. */
+        if (lock->timer == NULL) {
+                return NULL;
+        }
+
+        /* We are trying to access a lock that has an unlock timer active.
+         * This means that the lock must be idle, i.e. no fop can be in the
+         * owner, waiting or frozen lists. It also means that the lock cannot
+         * have been marked as being released (this is done without timers).
+         * There should only be one owner reference, but it's possible that
+         * some fops are being prepared to use this lock. */
+        GF_ASSERT ((lock->refs_owners == 1) &&
+                   list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We take the timer_link before cancelling the timer, since a
+         * successful cancellation will destroy it. It must not be NULL
+         * because it references the fop responsible for the delayed unlock
+         * that we are currently trying to cancel. */
+        timer_link = lock->timer->data;
+        GF_ASSERT(timer_link != NULL);
+
+        if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+                /* It's too late to avoid the execution of the timer callback.
+                 * Since we need to be sure that the callback has access to all
+                 * needed resources, we cannot resume the execution of the
+                 * timer fop now. This will be done in the callback. */
+                timer_link = NULL;
+        } else {
+                /* The timer has been cancelled. The fop referenced by
+                 * timer_link holds the last reference. The caller is
+                 * responsible to release it when not needed anymore. */
+                ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+        }
+
+        /* We have two options here:
+         *
+         * 1. The timer has been successfully cancelled.
+         *
+         *    This is the easiest case and we can continue with the currently
+         *    acquired lock.
+         *
+         * 2. The timer callback has already been fired.
+         *
+         *    In this case we have not been able to cancel the timer before
+         *    the timer callback has been fired, but we also know that
+         *    lock->timer != NULL. This means that the timer callback is still
+         *    trying to acquire the inode mutex that we currently own. We are
+         *    safe until we release it. In this case we can safely clear
+         *    lock->timer. This will cause that the timer callback does nothing
+         *    once it acquires the mutex.
+         */
+        lock->timer = NULL;
+
+        return timer_link;
+}
+
 static gf_boolean_t
 ec_lock_assign_owner(ec_lock_link_t *link)
 {
@@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)
      * empty. */
     GF_ASSERT(list_empty(&lock->frozen));
 
-    if (lock->timer != NULL) {
-        /* We are trying to acquire a lock that has an unlock timer active.
-         * This means that the lock must be idle, i.e. no fop can be in the
-         * owner, waiting or frozen lists. It also means that the lock cannot
-         * have been marked as being released (this is done without timers).
-         * There should only be one owner reference, but it's possible that
-         * some fops are being prepared to use this lock.
-         */
-        GF_ASSERT ((lock->refs_owners == 1) &&
-                   list_empty(&lock->owners) && list_empty(&lock->waiting));
-
-        /* We take the timer_link before cancelling the timer, since a
-         * successful cancellation will destroy it. It must not be NULL
-         * because it references the fop responsible for the delayed unlock
-         * that we are currently trying to cancel. */
-        timer_link = lock->timer->data;
-        GF_ASSERT(timer_link != NULL);
-
-        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
-            /* It's too late to avoid the execution of the timer callback.
-             * Since we need to be sure that the callback has access to all
-             * needed resources, we cannot resume the execution of the timer
-             * fop now. This will be done in the callback.
-             */
-            timer_link = NULL;
-        } else {
-            /* The timer has been cancelled, so we need to release the owner
-             * reference that was held by the fop waiting for the timer. This
-             * can be the last reference, but we'll immediately increment it
-             * for the current fop, so no need to check it.
-             */
-            lock->refs_owners--;
-
-            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
-        }
-
-        /* We have two options here:
-         *
-         * 1. The timer has been successfully cancelled.
-         *
-         *    This is the easiest case and we can continue with the currently
-         *    acquired lock.
-         *
-         * 2. The timer callback has already been fired.
-         *
-         *    In this case we have not been able to cancel the timer before
-         *    the timer callback has been fired, but we also know that
-         *    lock->timer != NULL. This means that the timer callback is still
-         *    trying to acquire the inode mutex that we currently own. We are
-         *    safe until we release it. In this case we can safely clear
-         *    lock->timer. This will cause that the timer callback does nothing
-         *    once it acquires the mutex.
-         */
-        lock->timer = NULL;
-    }
+    timer_link = ec_lock_timer_cancel(fop->xl, lock);
 
     if (!list_empty(&lock->owners)) {
         /* There are other owners of this lock. We can only take ownership if
@@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)
     }
 
     list_add_tail(&link->owner_list, &lock->owners);
-    lock->refs_owners++;
+
+    /* If timer_link is not NULL, it means that we have inherited the owner
+     * reference assigned to the timer fop. In this case we simply reuse it.
+     * Otherwise we need to increase the number of owners. */
+    if (timer_link == NULL) {
+            lock->refs_owners++;
+    }
 
     assigned = _gf_true;
 
@@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)
     ec_resume(link->fop, 0);
 }
 
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+        ec_lock_t *lock;
+        ec_inode_t *ctx;
+        ec_lock_link_t *timer_link = NULL;
+
+        LOCK(&inode->lock);
+
+        ctx = __ec_inode_get(inode, ec->xl);
+        if (ctx == NULL) {
+                goto done;
+        }
+        lock = ctx->inode_lock;
+        if ((lock == NULL) || !lock->acquired || lock->release) {
+                goto done;
+        }
+
+        gf_msg_debug(ec->xl->name, 0,
+                     "Releasing inode %p due to lock contention", inode);
+
+        /* The lock is not marked to be released, so the frozen list should be
+         * empty. */
+        GF_ASSERT(list_empty(&lock->frozen));
+
+        timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+        /* We mark the lock to be released as soon as possible. */
+        lock->release = _gf_true;
+
+done:
+        UNLOCK(&inode->lock);
+
+        /* If we have cancelled the timer, we need to start the unlock of the
+         * inode. If there was a timer but we have been unable to cancel it
+         * because it was just triggered, the timer callback will take care
+         * of releasing the inode. */
+        if (timer_link != NULL) {
+                ec_unlock_now(timer_link);
+        }
+}
+
 void ec_unlock_timer_add(ec_lock_link_t *link);
 
 void
@@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)
         ec_unlock_timer_del(data);
 }
 
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+        /* Fops with no locks at this point mean that they are sent as sub-fops
+         * of other higher level fops. In this case we simply assume that the
+         * parent fop will take correct care of the eager lock. */
+        if (fop->lock_count == 0) {
+                return _gf_true;
+        }
+
+        /* We may have more than one lock, but this only happens in the rename
+         * fop, and both locks will reference an inode of the same type (a
+         * directory in this case), so we only need to check the first lock. */
+        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+                return ec->eager_lock;
+        }
+
+        return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+        if (lock->loc.inode->ia_type == IA_IFREG) {
+                return ec->eager_lock_timeout;
+        }
+
+        return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+        struct timespec delay;
+        ec_fop_data_t *fop = link->fop;
+        ec_lock_t *lock = link->lock;
+
+        delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+        delay.tv_nsec = 0;
+        lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+                                          ec_unlock_timer_cbk, link);
+        if (lock->timer == NULL) {
+                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+                       EC_MSG_UNLOCK_DELAY_FAILED,
+                       "Unable to delay an unlock");
+
+                return _gf_false;
+        }
+
+        return _gf_true;
+}
+
 void ec_unlock_timer_add(ec_lock_link_t *link)
 {
-    struct timespec delay;
     ec_fop_data_t *fop = link->fop;
     ec_lock_t *lock = link->lock;
     gf_boolean_t now = _gf_false;
@@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
                      lock->release);
 
-            delay.tv_sec = 1;
-            delay.tv_nsec = 0;
-            lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
-                                              ec_unlock_timer_cbk, link);
-            if (lock->timer == NULL) {
-                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
-                       EC_MSG_UNLOCK_DELAY_FAILED,
-                       "Unable to delay an unlock");
-
+            if (!ec_lock_delay_create(link)) {
                 /* We are unable to create a new timer. We immediately release
                  * the lock. */
                 lock->release = now = _gf_true;
             }
+
         } else {
             ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
                      lock->release);
@@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)
     ec_update_info(&fop->locks[0]);
 }
 
-static gf_boolean_t
-ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
-{
-        /* Fops with no locks at this point mean that they are sent as sub-fops
-         * of other higher level fops. In this case we simply assume that the
-         * parent fop will take correct care of the eager lock. */
-        if (fop->lock_count == 0) {
-                return _gf_true;
-        }
-
-        /* We may have more than one lock, but this only happens in the rename
-         * fop, and both locks will reference an inode of the same type (a
-         * directory in this case), so we only need to check the first lock. */
-        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
-                return ec->eager_lock;
-        }
-
-        return ec->other_eager_lock;
-}
-
 static void
 ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
                  ec_fop_data_t *fop)
@@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
     ec = fop->xl->private;
     cbk = fop->answer;
 
-    if (ec_use_eager_lock(ec, fop) && cbk != NULL) {
+    if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
         if (cbk->xdata != NULL) {
             if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
                                 &count) == 0) && (count > 1)) {