1 files changed, 902 insertions, 743 deletions
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index e1702c78ba1..d4e51d6e0a1 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -7,128 +7,127 @@
    later), or the GNU General Public License, version 2 (GPLv2), in all
    cases as published by the Free Software Foundation.
 */
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "inode.h"
-#include "logging.h"
-#include "common-utils.h"
-#include "list.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/upcall-utils.h>
 
 #include "locks.h"
 #include "clear.h"
 #include "common.h"
 
 void
-__delete_inode_lock (pl_inode_lock_t *lock)
+__delete_inode_lock(pl_inode_lock_t *lock)
 {
-        list_del_init (&lock->list);
+    list_del_init(&lock->list);
 }
 
 static void
-__pl_inodelk_ref (pl_inode_lock_t *lock)
+__pl_inodelk_ref(pl_inode_lock_t *lock)
 {
-        lock->ref++;
+    lock->ref++;
 }
 
 void
-__pl_inodelk_unref (pl_inode_lock_t *lock)
+__pl_inodelk_unref(pl_inode_lock_t *lock)
 {
-        lock->ref--;
-        if (!lock->ref) {
-                GF_FREE (lock->connection_id);
-                GF_FREE (lock);
-        }
+    lock->ref--;
+    if (!lock->ref) {
+        GF_FREE(lock->connection_id);
+        GF_FREE(lock);
+    }
 }
 
-/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */
+/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't
+ * conflict */
 static int
-inodelk_type_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+inodelk_type_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
 {
-        if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK)
-                return 1;
+    if (l2->fl_type == F_WRLCK || l1->fl_type == F_WRLCK)
+        return 1;
 
-        return 0;
+    return 0;
 }
 
 void
-pl_print_inodelk (char *str, int size, int cmd, struct gf_flock *flock, const char *domain)
+pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock,
+                 const char *domain)
 {
-        char *cmd_str = NULL;
-        char *type_str = NULL;
+    char *cmd_str = NULL;
+    char *type_str = NULL;
 
-        switch (cmd) {
+    switch (cmd) {
 #if F_GETLK != F_GETLK64
         case F_GETLK64:
 #endif
         case F_GETLK:
-                cmd_str = "GETLK";
-                break;
+            cmd_str = "GETLK";
+            break;
 
 #if F_SETLK != F_SETLK64
         case F_SETLK64:
 #endif
         case F_SETLK:
-                cmd_str = "SETLK";
-                break;
+            cmd_str = "SETLK";
+            break;
 
 #if F_SETLKW != F_SETLKW64
         case F_SETLKW64:
 #endif
         case F_SETLKW:
-                cmd_str = "SETLKW";
-                break;
+            cmd_str = "SETLKW";
+            break;
 
         default:
-                cmd_str = "UNKNOWN";
-                break;
-        }
+            cmd_str = "UNKNOWN";
+            break;
+    }
 
-        switch (flock->l_type) {
+    switch (flock->l_type) {
         case F_RDLCK:
-                type_str = "READ";
-                break;
+            type_str = "READ";
+            break;
         case F_WRLCK:
-                type_str = "WRITE";
-                break;
+            type_str = "WRITE";
+            break;
         case F_UNLCK:
-                type_str = "UNLOCK";
-                break;
+            type_str = "UNLOCK";
+            break;
         default:
-                type_str = "UNKNOWN";
-                break;
-        }
-
-        snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, "
-                  "domain: %s, start=%llu, len=%llu, pid=%llu",
-                  cmd_str, type_str, domain,
-                  (unsigned long long) flock->l_start,
-                  (unsigned long long) flock->l_len,
-                  (unsigned long long) flock->l_pid);
+            type_str = "UNKNOWN";
+            break;
+    }
+
+    snprintf(str, size,
+             "lock=INODELK, cmd=%s, type=%s, "
+             "domain: %s, start=%llu, len=%llu, pid=%llu",
+             cmd_str, type_str, domain, (unsigned long long)flock->l_start,
+             (unsigned long long)flock->l_len,
+             (unsigned long long)flock->l_pid);
 }
 
 /* Determine if the two inodelks overlap reach other's lock regions */
 static int
-inodelk_overlap (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+inodelk_overlap(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
 {
-        return ((l1->fl_end >= l2->fl_start) &&
-                (l2->fl_end >= l1->fl_start));
+    return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
 }
 
 /* Returns true if the 2 inodelks have the same owner */
 static int
-same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+same_inodelk_owner(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
 {
-        return (is_same_lkowner (&l1->owner, &l2->owner) &&
-                (l1->client == l2->client));
+    return (is_same_lkowner(&l1->owner, &l2->owner) &&
+            (l1->client == l2->client));
 }
 
 /* Returns true if the 2 inodelks conflict with each other */
 static int
-inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+inodelk_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
 {
-        return (inodelk_overlap (l1, l2) &&
-                inodelk_type_conflict (l1, l2));
+    return (inodelk_overlap(l1, l2) && inodelk_type_conflict(l1, l2));
 }
 
 /*
@@ -137,24 +136,21 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
  * true if it exceeds the configured threshold, false otherwise.
  */
 static inline gf_boolean_t
-__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock,
+__stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock,
                 pl_inode_lock_t *requested_lock, time_t *lock_age_sec)
 {
-        posix_locks_private_t  *priv = NULL;
-        struct timeval curr;
-
-        priv = this->private;
-        gettimeofday (&curr, NULL);
-        /* Question: Should we just prune them all given the
-         * chance?  Or just the locks we are attempting to acquire?
-         */
-        if (inodelk_conflict (candidate_lock, requested_lock)) {
-                *lock_age_sec = curr.tv_sec -
-                        candidate_lock->granted_time.tv_sec;
-                if (*lock_age_sec > priv->revocation_secs)
-                        return _gf_true;
-        }
-        return _gf_false;
+    posix_locks_private_t *priv = NULL;
+
+    priv = this->private;
+    /* Question: Should we just prune them all given the
+     * chance?  Or just the locks we are attempting to acquire?
+     */
+    if (inodelk_conflict(candidate_lock, requested_lock)) {
+        *lock_age_sec = gf_time() - candidate_lock->granted_time;
+        if (*lock_age_sec > priv->revocation_secs)
+            return _gf_true;
+    }
+    return _gf_false;
 }
 
 /* Examine any locks held on this inode and potentially revoke the lock
@@ -166,850 +162,1013 @@ __stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock,
  * are beyond a threshold.
  */
 static gf_boolean_t
-__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
-                       pl_inode_lock_t *lock)
-{
-        posix_locks_private_t  *priv = NULL;
-        pl_inode_lock_t *tmp = NULL;
-        pl_inode_lock_t *lk = NULL;
-        gf_boolean_t revoke_lock = _gf_false;
-        int bcount = 0;
-        int gcount = 0;
-        int op_errno = 0;
-        clrlk_args args;
-        args.opts = NULL;
-        time_t lk_age_sec = 0;
-        uint32_t max_blocked = 0;
-        char *reason_str = NULL;
-
-        priv = this->private;
-
-        args.type = CLRLK_INODE;
-        if (priv->revocation_clear_all == _gf_true)
-                args.kind = CLRLK_ALL;
-        else
-                args.kind = CLRLK_GRANTED;
-
-        if (list_empty (&dom->inodelk_list))
-                goto out;
-
-        pthread_mutex_lock (&pinode->mutex);
-        list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) {
-                if (__stale_inodelk (this, lk, lock, &lk_age_sec) == _gf_true) {
-                        revoke_lock = _gf_true;
-                        reason_str = "age";
-                        break;
-                }
+__inodelk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                      pl_inode_lock_t *lock)
+{
+    posix_locks_private_t *priv = NULL;
+    pl_inode_lock_t *tmp = NULL;
+    pl_inode_lock_t *lk = NULL;
+    gf_boolean_t revoke_lock = _gf_false;
+    int bcount = 0;
+    int gcount = 0;
+    int op_errno = 0;
+    clrlk_args args;
+    args.opts = NULL;
+    time_t lk_age_sec = 0;
+    uint32_t max_blocked = 0;
+    char *reason_str = NULL;
+
+    priv = this->private;
+
+    args.type = CLRLK_INODE;
+    if (priv->revocation_clear_all == _gf_true)
+        args.kind = CLRLK_ALL;
+    else
+        args.kind = CLRLK_GRANTED;
+
+    if (list_empty(&dom->inodelk_list))
+        goto out;
+
+    pthread_mutex_lock(&pinode->mutex);
+    list_for_each_entry_safe(lk, tmp, &dom->inodelk_list, list)
+    {
+        if (__stale_inodelk(this, lk, lock, &lk_age_sec) == _gf_true) {
+            revoke_lock = _gf_true;
+            reason_str = "age";
+            break;
         }
+    }
 
-        max_blocked = priv->revocation_max_blocked;
-        if (max_blocked != 0 && revoke_lock == _gf_false) {
-                list_for_each_entry_safe (lk, tmp, &dom->blocked_inodelks,
-                                blocked_locks) {
-                        max_blocked--;
-                        if (max_blocked == 0) {
-                                revoke_lock = _gf_true;
-                                reason_str = "max blocked";
-                                break;
-                        }
-                }
+    max_blocked = priv->revocation_max_blocked;
+    if (max_blocked != 0 && revoke_lock == _gf_false) {
+        list_for_each_entry_safe(lk, tmp, &dom->blocked_inodelks, blocked_locks)
+        {
+            max_blocked--;
+            if (max_blocked == 0) {
+                revoke_lock = _gf_true;
+                reason_str = "max blocked";
+                break;
+            }
         }
-        pthread_mutex_unlock (&pinode->mutex);
+    }
+    pthread_mutex_unlock(&pinode->mutex);
 
 out:
-        if (revoke_lock == _gf_true) {
-                clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount,
-                        &op_errno);
-                gf_log (this->name, GF_LOG_WARNING,
-                        "Lock revocation [reason: %s; gfid: %s; domain: %s; "
-                        "age: %ld sec] - Inode lock revoked:  %d granted & %d "
-                        "blocked locks cleared",
-                        reason_str, uuid_utoa (pinode->gfid), dom->domain,
-                        lk_age_sec, gcount, bcount);
+    if (revoke_lock == _gf_true) {
+        clrlk_clear_inodelk(this, pinode, dom, &args, &bcount, &gcount,
+                            &op_errno);
+        gf_log(this->name, GF_LOG_WARNING,
+               "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+               "age: %ld sec] - Inode lock revoked:  %d granted & %d "
+               "blocked locks cleared",
+               reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec,
+               gcount, bcount);
+    }
+    return revoke_lock;
+}
+
+void
+inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock,
+                                struct timespec *now, struct list_head *contend)
+{
+    posix_locks_private_t *priv;
+    int64_t elapsed;
+
+    priv = this->private;
+
+    /* If this lock is in a list, it means that we are about to send a
+     * notification for it, so no need to do anything else. */
+    if (!list_empty(&lock->contend)) {
+        return;
+    }
+
+    elapsed = now->tv_sec;
+    elapsed -= lock->contention_time.tv_sec;
+    if (now->tv_nsec < lock->contention_time.tv_nsec) {
+        elapsed--;
+    }
+    if (elapsed < priv->notify_contention_delay) {
+        return;
+    }
+
+    /* All contention notifications will be sent outside of the locked
+     * region. This means that currently granted locks might have already
+     * been unlocked by that time. To prevent the lock or the inode from
+     * being destroyed before we process them, we take an additional
+     * reference on both. */
+    inode_ref(lock->pl_inode->inode);
+    __pl_inodelk_ref(lock);
+
+    lock->contention_time = *now;
+
+    list_add_tail(&lock->contend, contend);
+}
+
+void
+inodelk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+    struct gf_upcall up;
+    struct gf_upcall_inodelk_contention lc;
+    pl_inode_lock_t *lock;
+    pl_inode_t *pl_inode;
+    client_t *client;
+    gf_boolean_t notify;
+
+    while (!list_empty(contend)) {
+        lock = list_first_entry(contend, pl_inode_lock_t, contend);
+
+        pl_inode = lock->pl_inode;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        /* If the lock has already been released, no notification is
+         * sent. We clear the notification time in this case. */
+        notify = !list_empty(&lock->list);
+        if (!notify) {
+            lock->contention_time.tv_sec = 0;
+            lock->contention_time.tv_nsec = 0;
+        } else {
+            memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock));
+            lc.pid = lock->client_pid;
+            lc.domain = lock->volume;
+            lc.xdata = NULL;
+
+            gf_uuid_copy(up.gfid, lock->pl_inode->gfid);
+            client = (client_t *)lock->client;
+            if (client == NULL) {
+                /* A NULL client can be found if the inodelk
+                 * was issued by a server side xlator. */
+                up.client_uid = NULL;
+            } else {
+                up.client_uid = client->client_uid;
+            }
         }
-        return revoke_lock;
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        if (notify) {
+            up.event_type = GF_UPCALL_INODELK_CONTENTION;
+            up.data = &lc;
+
+            if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+                gf_msg_debug(this->name, 0,
+                             "Inodelk contention notification "
+                             "failed");
+            } else {
+                gf_msg_debug(this->name, 0,
+                             "Inodelk contention notification "
+                             "sent");
+            }
+        }
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        list_del_init(&lock->contend);
+        __pl_inodelk_unref(lock);
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        inode_unref(pl_inode->inode);
+    }
 }
 
 /* Determine if lock is grantable or not */
 static pl_inode_lock_t *
-__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)
+__inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+                    struct timespec *now, struct list_head *contend)
 {
-        pl_inode_lock_t *l = NULL;
-        pl_inode_lock_t *ret = NULL;
-        if (list_empty (&dom->inodelk_list))
-                goto out;
-        list_for_each_entry (l, &dom->inodelk_list, list){
-                if (inodelk_conflict (lock, l) &&
-                    !same_inodelk_owner (lock, l)) {
-                        ret = l;
-                        goto out;
+    pl_inode_lock_t *l = NULL;
+    pl_inode_lock_t *ret = NULL;
+
+    list_for_each_entry(l, &dom->inodelk_list, list)
+    {
+        if (inodelk_conflict(lock, l) && !same_inodelk_owner(lock, l)) {
+            if (ret == NULL) {
+                ret = l;
+                if (contend == NULL) {
+                    break;
                 }
+            }
+            inodelk_contention_notify_check(this, l, now, contend);
         }
-out:
-        return ret;
+    }
+
+    return ret;
 }
 
 static pl_inode_lock_t *
-__blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock)
+__blocked_lock_conflict(pl_dom_list_t *dom, pl_inode_lock_t *lock)
 {
-        pl_inode_lock_t *l   = NULL;
-        pl_inode_lock_t *ret = NULL;
+    pl_inode_lock_t *l = NULL;
 
-        if (list_empty (&dom->blocked_inodelks))
-                return NULL;
-
-        list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) {
-                if (inodelk_conflict (lock, l)) {
-                        ret = l;
-                        goto out;
-                }
+    list_for_each_entry(l, &dom->blocked_inodelks, blocked_locks)
+    {
+        if (inodelk_conflict(lock, l)) {
+            return l;
         }
+    }
 
-out:
-        return ret;
+    return NULL;
 }
 
 static int
-__owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock)
+__owner_has_lock(pl_dom_list_t *dom, pl_inode_lock_t *newlock)
 {
-        pl_inode_lock_t *lock = NULL;
+    pl_inode_lock_t *lock = NULL;
 
-        list_for_each_entry (lock, &dom->inodelk_list, list) {
-                if (same_inodelk_owner (lock, newlock))
-                        return 1;
-        }
+    list_for_each_entry(lock, &dom->inodelk_list, list)
+    {
+        if (same_inodelk_owner(lock, newlock))
+            return 1;
+    }
 
-        list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) {
-                if (same_inodelk_owner (lock, newlock))
-                        return 1;
-        }
+    list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks)
+    {
+        if (same_inodelk_owner(lock, newlock))
+            return 1;
+    }
 
-        return 0;
+    return 0;
 }
 
+static int
+__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+                   int can_block)
+{
+    if (can_block == 0) {
+        goto out;
+    }
+
+    lock->blkd_time = gf_time();
+    list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks);
+
+    gf_msg_trace(this->name, 0,
+                 "%s (pid=%d) (lk-owner=%s) %" PRId64
+                 " - "
+                 "%" PRId64 " => Blocked",
+                 lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+                 lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+                 lock->user_flock.l_len);
+
+    pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+                   lock->volume);
+out:
+    return -EAGAIN;
+}
 
 /* Determines if lock can be granted and adds the lock. If the lock
  * is blocking, adds it to the blocked_inodelks list of the domain.
  */
 static int
-__lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
-                int can_block,  pl_dom_list_t *dom)
+__lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+               int can_block, pl_dom_list_t *dom, struct timespec *now,
+               struct list_head *contend)
 {
-        pl_inode_lock_t *conf = NULL;
-        int ret = -EINVAL;
-
-        conf = __inodelk_grantable (dom, lock);
-        if (conf) {
-                ret = -EAGAIN;
-                if (can_block == 0)
-                        goto out;
+    pl_inode_lock_t *conf = NULL;
+    int ret;
 
-                gettimeofday (&lock->blkd_time, NULL);
-                list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
-                gf_log (this->name, GF_LOG_TRACE,
-                        "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked",
-                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-                        lock->client_pid,
-                        lkowner_utoa (&lock->owner),
-                        lock->user_flock.l_start,
-                        lock->user_flock.l_len);
-
-
-                goto out;
-        }
-
-        /* To prevent blocked locks starvation, check if there are any blocked
-         * locks thay may conflict with this lock. If there is then don't grant
-         * the lock. BUT grant the lock if the owner already has lock to allow
-         * nested locks.
-         * Example:
-         * SHD from Machine1 takes (gfid, 0-infinity) and is granted.
-         * SHD from machine2 takes (gfid, 0-infinity) and is blocked.
-         * When SHD from Machine1 takes (gfid, 0-128KB) it
-         * needs to be granted, without which the earlier lock on 0-infinity
-         * will not be unlocked by SHD from Machine1.
-         * TODO: Find why 'owner_has_lock' is checked even for blocked locks.
-         */
-        if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
-                ret = -EAGAIN;
-                if (can_block == 0)
-                        goto out;
-
-                gettimeofday (&lock->blkd_time, NULL);
-                list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "Lock is grantable, but blocking to prevent starvation");
-                gf_log (this->name, GF_LOG_TRACE,
-                        "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked",
-                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-                        lock->client_pid,
-                        lkowner_utoa (&lock->owner),
-                        lock->user_flock.l_start,
-                        lock->user_flock.l_len);
-
-
-                goto out;
+    ret = pl_inode_remove_inodelk(pl_inode, lock);
+    if (ret < 0) {
+        return ret;
+    }
+    if (ret == 0) {
+        conf = __inodelk_grantable(this, dom, lock, now, contend);
+    }
+    if ((ret > 0) || (conf != NULL)) {
+        return __lock_blocked_add(this, dom, lock, can_block);
+    }
+
+    /* To prevent blocked locks starvation, check if there are any blocked
+     * locks that may conflict with this lock. If there are, then don't grant
+     * the lock. BUT grant the lock if the owner already has lock to allow
+     * nested locks.
+     * Example:
+     * SHD from Machine1 takes (gfid, 0-infinity) and is granted.
+     * SHD from machine2 takes (gfid, 0-infinity) and is blocked.
+     * When SHD from Machine1 takes (gfid, 0-128KB) it
+     * needs to be granted, without which the earlier lock on 0-infinity
+     * will not be unlocked by SHD from Machine1.
+     * TODO: Find why 'owner_has_lock' is checked even for blocked locks.
+     */
+    if (__blocked_lock_conflict(dom, lock) && !(__owner_has_lock(dom, lock))) {
+        if (can_block != 0) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock is grantable, but blocking to prevent "
+                   "starvation");
         }
-        __pl_inodelk_ref (lock);
-        gettimeofday (&lock->granted_time, NULL);
-        list_add (&lock->list, &dom->inodelk_list);
 
-        ret = 0;
+        return __lock_blocked_add(this, dom, lock, can_block);
+    }
+    __pl_inodelk_ref(lock);
+    lock->granted_time = gf_time();
+    list_add(&lock->list, &dom->inodelk_list);
 
-out:
-        return ret;
+    return 0;
 }
 
 /* Return true if the two inodelks have exactly same lock boundaries */
 static int
-inodelks_equal (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+inodelks_equal(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
 {
-        if ((l1->fl_start == l2->fl_start) &&
-            (l1->fl_end == l2->fl_end))
-                return 1;
+    if ((l1->fl_start == l2->fl_start) && (l1->fl_end == l2->fl_end))
+        return 1;
 
-        return 0;
+    return 0;
 }
 
-
 static pl_inode_lock_t *
-find_matching_inodelk (pl_inode_lock_t *lock, pl_dom_list_t *dom)
+find_matching_inodelk(pl_inode_lock_t *lock, pl_dom_list_t *dom)
 {
-        pl_inode_lock_t *l = NULL;
-        list_for_each_entry (l, &dom->inodelk_list, list) {
-                if (inodelks_equal (l, lock) &&
-                    same_inodelk_owner (l, lock))
-                        return l;
-        }
-        return NULL;
+    pl_inode_lock_t *l = NULL;
+    list_for_each_entry(l, &dom->inodelk_list, list)
+    {
+        if (inodelks_equal(l, lock) && same_inodelk_owner(l, lock))
+            return l;
+    }
+    return NULL;
 }
 
 /* Set F_UNLCK removes a lock which has the exact same lock boundaries
  * as the UNLCK lock specifies. If such a lock is not found, returns invalid
  */
 static pl_inode_lock_t *
-__inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
+__inode_unlock_lock(xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
 {
-
-        pl_inode_lock_t *conf = NULL;
-
-        conf = find_matching_inodelk (lock, dom);
-        if (!conf) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        " Matching lock not found for unlock %llu-%llu, by %s "
-                        "on %p", (unsigned long long)lock->fl_start,
-                        (unsigned long long)lock->fl_end,
-                        lkowner_utoa (&lock->owner), lock->client);
-                goto out;
-        }
-        __delete_inode_lock (conf);
-        gf_log (this->name, GF_LOG_DEBUG,
-                " Matching lock found for unlock %llu-%llu, by %s on %p",
-                (unsigned long long)lock->fl_start,
-                (unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner),
-                lock->client);
+    pl_inode_lock_t *conf = NULL;
+    inode_t *inode = NULL;
+
+    inode = lock->pl_inode->inode;
+
+    conf = find_matching_inodelk(lock, dom);
+    if (!conf) {
+        gf_log(this->name, GF_LOG_ERROR,
+               " Matching lock not found for unlock %llu-%llu, by %s "
+               "on %p for gfid:%s",
+               (unsigned long long)lock->fl_start,
+               (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner),
+               lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN");
+        goto out;
+    }
+    __delete_inode_lock(conf);
+    gf_log(this->name, GF_LOG_DEBUG,
+           " Matching lock found for unlock %llu-%llu, by %s on %p for gfid:%s",
+           (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end,
+           lkowner_utoa(&lock->owner), lock->client,
+           inode ? uuid_utoa(inode->gfid) : "UNKNOWN");
 
 out:
-        return conf;
+    return conf;
 }
 
-
-static void
-__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
-                             struct list_head *granted, pl_dom_list_t *dom)
+void
+__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                            struct list_head *granted, pl_dom_list_t *dom,
+                            struct timespec *now, struct list_head *contend)
 {
-        int              bl_ret = 0;
-        pl_inode_lock_t *bl = NULL;
-        pl_inode_lock_t *tmp = NULL;
-
-        struct list_head blocked_list;
+    pl_inode_lock_t *bl = NULL;
+    pl_inode_lock_t *tmp = NULL;
 
-        INIT_LIST_HEAD (&blocked_list);
-        list_splice_init (&dom->blocked_inodelks, &blocked_list);
+    struct list_head blocked_list;
 
-        list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) {
+    INIT_LIST_HEAD(&blocked_list);
+    list_splice_init(&dom->blocked_inodelks, &blocked_list);
 
-                list_del_init (&bl->blocked_locks);
+    list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks)
+    {
+        list_del_init(&bl->blocked_locks);
 
-                bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom);
+        bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend);
 
-                if (bl_ret == 0) {
-                        list_add (&bl->blocked_locks, granted);
-                }
+        if (bl->status != -EAGAIN) {
+            list_add_tail(&bl->blocked_locks, granted);
         }
-        return;
+    }
 }
 
-/* Grant all inodelks blocked on a lock */
 void
-grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
-                           pl_dom_list_t *dom)
+unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
+                      struct list_head *granted)
 {
-        struct list_head granted;
-        pl_inode_lock_t *lock;
-        pl_inode_lock_t *tmp;
-
-        INIT_LIST_HEAD (&granted);
-
-        pthread_mutex_lock (&pl_inode->mutex);
-        {
-                __grant_blocked_inode_locks (this, pl_inode, &granted, dom);
-        }
-        pthread_mutex_unlock (&pl_inode->mutex);
-
-        list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) {
-                gf_log (this->name, GF_LOG_TRACE,
-                        "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Granted",
-                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-                        lock->client_pid,
-                        lkowner_utoa (&lock->owner),
-                        lock->user_flock.l_start,
-                        lock->user_flock.l_len);
-
-                pl_trace_out (this, lock->frame, NULL, NULL, F_SETLKW,
-                              &lock->user_flock, 0, 0, lock->volume);
-
-                STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL);
-		lock->frame = NULL;
+    pl_inode_lock_t *lock;
+    pl_inode_lock_t *tmp;
+    int32_t op_ret;
+    int32_t op_errno;
+
+    list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
+    {
+        if (lock->status == 0) {
+            op_ret = 0;
+            op_errno = 0;
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                   " => Granted",
+                   lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                   lock->client_pid, lkowner_utoa(&lock->owner),
+                   lock->user_flock.l_start, lock->user_flock.l_len);
+        } else {
+            op_ret = -1;
+            op_errno = -lock->status;
         }
+        pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+                     op_ret, op_errno, lock->volume);
+
+        STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL);
+        lock->frame = NULL;
+    }
 
-        pthread_mutex_lock (&pl_inode->mutex);
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
         {
-                list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) {
-                        list_del_init (&lock->blocked_locks);
-                        __pl_inodelk_unref (lock);
-                }
+            list_del_init(&lock->blocked_locks);
+            __pl_inodelk_unref(lock);
         }
-        pthread_mutex_unlock (&pl_inode->mutex);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
 }
 
+/* Grant all inodelks blocked on a lock */
+void
+grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                          pl_dom_list_t *dom, struct timespec *now,
+                          struct list_head *contend)
+{
+    struct list_head granted;
+
+    INIT_LIST_HEAD(&granted);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now,
+                                    contend);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    unwind_granted_inodes(this, pl_inode, &granted);
+}
 
 static void
-pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
+pl_inodelk_log_cleanup(pl_inode_lock_t *lock)
 {
-	pl_inode_t *pl_inode = NULL;
+    pl_inode_t *pl_inode = NULL;
 
-	pl_inode = lock->pl_inode;
+    pl_inode = lock->pl_inode;
 
-        gf_log (THIS->name, GF_LOG_WARNING, "releasing lock on %s held by "
-                "{client=%p, pid=%"PRId64" lk-owner=%s}",
-                uuid_utoa (pl_inode->gfid), lock->client,
-                (uint64_t) lock->client_pid, lkowner_utoa (&lock->owner));
+    gf_log(THIS->name, GF_LOG_WARNING,
+           "releasing lock on %s held by "
+           "{client=%p, pid=%" PRId64 " lk-owner=%s}",
+           uuid_utoa(pl_inode->gfid), lock->client, (uint64_t)lock->client_pid,
+           lkowner_utoa(&lock->owner));
 }
 
-
 /* Release all inodelks from this client */
 int
-pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
+pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
 {
-        pl_inode_lock_t *tmp = NULL;
-        pl_inode_lock_t *l = NULL;
-	pl_dom_list_t *dom = NULL;
-        pl_inode_t *pl_inode = NULL;
-
-        struct list_head released;
-        struct list_head unwind;
-
-        INIT_LIST_HEAD (&released);
-        INIT_LIST_HEAD (&unwind);
-
-	pthread_mutex_lock (&ctx->lock);
+    posix_locks_private_t *priv;
+    pl_inode_lock_t *tmp = NULL;
+    pl_inode_lock_t *l = NULL;
+    pl_dom_list_t *dom = NULL;
+    pl_inode_t *pl_inode = NULL;
+    struct list_head *pcontend = NULL;
+    struct list_head released;
+    struct list_head unwind;
+    struct list_head contend;
+    struct timespec now = {};
+
+    priv = this->private;
+
+    INIT_LIST_HEAD(&released);
+    INIT_LIST_HEAD(&unwind);
+
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        list_for_each_entry_safe(l, tmp, &ctx->inodelk_lockers, client_list)
         {
-                list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
-					  client_list) {
-			pl_inodelk_log_cleanup (l);
-
-			pl_inode = l->pl_inode;
-
-			pthread_mutex_lock (&pl_inode->mutex);
-			{
-                        /* If the inodelk object is part of granted list but not
-                         * blocked list, then perform the following actions:
-                         * i.   delete the object from granted list;
-                         * ii.  grant other locks (from other clients) that may
-                         *      have been blocked on this inodelk; and
-                         * iii. unref the object.
-                         *
-                         * If the inodelk object (L1) is part of both granted
-                         * and blocked lists, then this means that a parallel
-                         * unlock on another inodelk (L2 say) may have 'granted'
-                         * L1 and added it to 'granted' list in
-                         * __grant_blocked_node_locks() (although using the
-                         * 'blocked_locks' member). In that case, the cleanup
-                         * codepath must try and grant other overlapping
-                         * blocked inodelks from other clients, now that L1 is
-                         * out of their way and then unref L1 in the end, and
-                         * leave it to the other thread (the one executing
-                         * unlock codepath) to unwind L1's frame, delete it from
-                         * blocked_locks list, and perform the last unref on L1.
-                         *
-                         * If the inodelk object (L1) is part of blocked list
-                         * only, the cleanup code path must:
-                         * i.   delete it from the blocked_locks list inside
-                         *      this critical section,
-                         * ii.  unwind its frame with EAGAIN,
-                         * iii. try and grant blocked inode locks from other
-                         *      clients that were otherwise grantable, but just
-                         *      got blocked to avoid leaving L1 to starve
-                         *      forever.
-                         * iv.  unref the object.
-                         */
-                                list_del_init (&l->client_list);
-
-                                if (!list_empty (&l->list)) {
-                                        __delete_inode_lock (l);
-                                        list_add_tail (&l->client_list,
-                                                       &released);
-                                } else {
-                                        list_del_init(&l->blocked_locks);
-                                        list_add_tail (&l->client_list,
-                                                       &unwind);
-                                }
-                        }
-			pthread_mutex_unlock (&pl_inode->mutex);
+            pl_inodelk_log_cleanup(l);
+
+            pl_inode = l->pl_inode;
+
+            pthread_mutex_lock(&pl_inode->mutex);
+            {
+                /* If the inodelk object is part of granted list but not
+                 * blocked list, then perform the following actions:
+                 * i.   delete the object from granted list;
+                 * ii.  grant other locks (from other clients) that may
+                 *      have been blocked on this inodelk; and
+                 * iii. unref the object.
+                 *
+                 * If the inodelk object (L1) is part of both granted
+                 * and blocked lists, then this means that a parallel
+                 * unlock on another inodelk (L2 say) may have 'granted'
+                 * L1 and added it to 'granted' list in
+                 * __grant_blocked_inode_locks() (although using the
+                 * 'blocked_locks' member). In that case, the cleanup
+                 * codepath must try and grant other overlapping
+                 * blocked inodelks from other clients, now that L1 is
+                 * out of their way and then unref L1 in the end, and
+                 * leave it to the other thread (the one executing
+                 * unlock codepath) to unwind L1's frame, delete it from
+                 * blocked_locks list, and perform the last unref on L1.
+                 *
+                 * If the inodelk object (L1) is part of blocked list
+                 * only, the cleanup code path must:
+                 * i.   delete it from the blocked_locks list inside
+                 *      this critical section,
+                 * ii.  unwind its frame with EAGAIN,
+                 * iii. try and grant blocked inode locks from other
+                 *      clients that were otherwise grantable, but just
+                 *      got blocked to avoid leaving L1 to starve
+                 *      forever.
+                 * iv.  unref the object.
+                 */
+                list_del_init(&l->client_list);
+
+                if (!list_empty(&l->list)) {
+                    __delete_inode_lock(l);
+                    list_add_tail(&l->client_list, &released);
+                } else {
+                    list_del_init(&l->blocked_locks);
+                    list_add_tail(&l->client_list, &unwind);
                 }
-	}
-        pthread_mutex_unlock (&ctx->lock);
-
-        list_for_each_entry_safe (l, tmp, &unwind, client_list) {
-                list_del_init (&l->client_list);
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+        }
+    }
+    pthread_mutex_unlock(&ctx->lock);
 
-                if (l->frame)
-			STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
-					     NULL);
-                list_add_tail (&l->client_list, &released);
+    if (!list_empty(&unwind)) {
+        list_for_each_entry_safe(l, tmp, &unwind, client_list)
+        {
+            list_del_init(&l->client_list);
 
+            if (l->frame)
+                STACK_UNWIND_STRICT(inodelk, l->frame, -1, EAGAIN, NULL);
+            list_add_tail(&l->client_list, &released);
         }
+    }
 
-        list_for_each_entry_safe (l, tmp, &released, client_list) {
-                list_del_init (&l->client_list);
+    if (!list_empty(&released)) {
+        list_for_each_entry_safe(l, tmp, &released, client_list)
+        {
+            list_del_init(&l->client_list);
 
-		pl_inode = l->pl_inode;
+            pl_inode = l->pl_inode;
 
-		dom = get_domain (pl_inode, l->volume);
+            dom = get_domain(pl_inode, l->volume);
 
-		grant_blocked_inode_locks (this, pl_inode, dom);
+            grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend);
 
-		pthread_mutex_lock (&pl_inode->mutex);
-		{
-			__pl_inodelk_unref (l);
-		}
-		pthread_mutex_unlock (&pl_inode->mutex);
-                inode_unref (pl_inode->inode);
+            pthread_mutex_lock(&pl_inode->mutex);
+            {
+                __pl_inodelk_unref(l);
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+            inode_unref(pl_inode->inode);
         }
+    }
 
-        return 0;
-}
+    if (pcontend != NULL) {
+        inodelk_contention_notify(this, pcontend);
+    }
 
+    return 0;
+}
 
 static int
-pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
-		pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
-                inode_t *inode)
-{
-        posix_locks_private_t  *priv = NULL;
-        int               ret              = -EINVAL;
-        pl_inode_lock_t  *retlock          =  NULL;
-        gf_boolean_t      unref            =  _gf_true;
-        gf_boolean_t      need_inode_unref =  _gf_false;
-        short             fl_type;
-
-	lock->pl_inode = pl_inode;
-        fl_type = lock->fl_type;
-
-        priv = this->private;
-
-        /* Ideally, AFTER a successful lock (both blocking and non-blocking) or
-         * an unsuccessful blocking lock operation, the inode needs to be ref'd.
-         *
-         * But doing so might give room to a race where the lock-requesting
-         * client could send a DISCONNECT just before this thread refs the inode
-         * after the locking is done, and the epoll thread could unref the inode
-         * in cleanup which means the inode's refcount would come down to 0, and
-         * the call to pl_forget() at this point destroys @pl_inode. Now when
-         * the io-thread executing this function tries to access pl_inode,
-         * it could crash on account of illegal memory access.
-         *
-         * To get around this problem, the inode is ref'd once even before
-         * adding the lock into client_list as a precautionary measure.
-         * This way even if there are DISCONNECTs, there will always be 1 extra
-         * ref on the inode, so @pl_inode is still alive until after the
-         * current stack unwinds.
-         */
-        pl_inode->inode = inode_ref (inode);
-
-        if (priv->revocation_secs != 0) {
-                if (lock->fl_type != F_UNLCK) {
-                        __inodelk_prune_stale (this, pl_inode, dom, lock);
-                } else if (priv->monkey_unlocking == _gf_true) {
-                        if (pl_does_monkey_want_stuck_lock ()) {
-                                pthread_mutex_lock (&pl_inode->mutex);
-                                {
-                                        __pl_inodelk_unref (lock);
-                                }
-                                pthread_mutex_unlock (&pl_inode->mutex);
-                                inode_unref (pl_inode->inode);
-                                gf_log (this->name, GF_LOG_WARNING,
-                                    "MONKEY LOCKING (forcing stuck lock)!");
-                                return 0;
-                        }
+pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+               pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
+               inode_t *inode)
+{
+    posix_locks_private_t *priv = NULL;
+    int ret = -EINVAL;
+    pl_inode_lock_t *retlock = NULL;
+    gf_boolean_t unref = _gf_true;
+    gf_boolean_t need_inode_unref = _gf_false;
+    struct list_head *pcontend = NULL;
+    struct list_head contend;
+    struct list_head wake;
+    struct timespec now = {};
+    short fl_type;
+
+    lock->pl_inode = pl_inode;
+    fl_type = lock->fl_type;
+
+    priv = this->private;
+
+    /* Ideally, AFTER a successful lock (both blocking and non-blocking) or
+     * an unsuccessful blocking lock operation, the inode needs to be ref'd.
+     *
+     * But doing so might give room to a race where the lock-requesting
+     * client could send a DISCONNECT just before this thread refs the inode
+     * after the locking is done, and the epoll thread could unref the inode
+     * in cleanup which means the inode's refcount would come down to 0, and
+     * the call to pl_forget() at this point destroys @pl_inode. Now when
+     * the io-thread executing this function tries to access pl_inode,
+     * it could crash on account of illegal memory access.
+     *
+     * To get around this problem, the inode is ref'd once even before
+     * adding the lock into client_list as a precautionary measure.
+     * This way even if there are DISCONNECTs, there will always be 1 extra
+     * ref on the inode, so @pl_inode is still alive until after the
+     * current stack unwinds.
+     */
+    pl_inode->inode = inode_ref(inode);
+
+    if (priv->revocation_secs != 0) {
+        if (lock->fl_type != F_UNLCK) {
+            __inodelk_prune_stale(this, pl_inode, dom, lock);
+        } else if (priv->monkey_unlocking == _gf_true) {
+            if (pl_does_monkey_want_stuck_lock()) {
+                pthread_mutex_lock(&pl_inode->mutex);
+                {
+                    __pl_inodelk_unref(lock);
                 }
+                pthread_mutex_unlock(&pl_inode->mutex);
+                inode_unref(pl_inode->inode);
+                gf_log(this->name, GF_LOG_WARNING,
+                       "MONKEY LOCKING (forcing stuck lock)!");
+                return 0;
+            }
         }
-
-	if (ctx)
-		pthread_mutex_lock (&ctx->lock);
-        pthread_mutex_lock (&pl_inode->mutex);
-        {
-                if (lock->fl_type != F_UNLCK) {
-                        ret = __lock_inodelk (this, pl_inode, lock, can_block, dom);
-                        if (ret == 0) {
-				lock->frame = NULL;
-                                gf_log (this->name, GF_LOG_TRACE,
-                                        "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK",
-                                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-                                        lock->client_pid,
-                                        lkowner_utoa (&lock->owner),
-                                        lock->fl_start,
-                                        lock->fl_end);
-                        } else if (ret == -EAGAIN) {
-                                gf_log (this->name, GF_LOG_TRACE,
-                                        "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => NOK",
-                                        lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-                                        lock->client_pid,
-                                        lkowner_utoa (&lock->owner),
-                                        lock->user_flock.l_start,
-                                        lock->user_flock.l_len);
-                                if (can_block)
-                                        unref = _gf_false;
-                                /* For all but the case where a non-blocking
-                                 * lock attempt fails, the extra ref taken at
-                                 * the start of this function must be negated.
-                                 */
-                                else
-                                        need_inode_unref = _gf_true;
-                        }
-
-			if (ctx && (!ret || can_block))
-				list_add_tail (&lock->client_list,
-					       &ctx->inodelk_lockers);
-                } else {
-                        /* Irrespective of whether unlock succeeds or not,
-                         * the extra inode ref that was done at the start of
-                         * this function must be negated. Towards this,
-                         * @need_inode_unref flag is set unconditionally here.
-                         */
-                        need_inode_unref = _gf_true;
-                        retlock = __inode_unlock_lock (this, lock, dom);
-                        if (!retlock) {
-                                gf_log (this->name, GF_LOG_DEBUG,
-                                        "Bad Unlock issued on Inode lock");
-                                ret = -EINVAL;
-                                goto out;
-                        }
-			list_del_init (&retlock->client_list);
-			__pl_inodelk_unref (retlock);
-
-                        ret = 0;
+    }
+
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    INIT_LIST_HEAD(&wake);
+
+    if (ctx)
+        pthread_mutex_lock(&ctx->lock);
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (lock->fl_type != F_UNLCK) {
+            ret = __lock_inodelk(this, pl_inode, lock, can_block, dom, &now,
+                                 pcontend);
+            if (ret == 0) {
+                lock->frame = NULL;
+                gf_log(this->name, GF_LOG_TRACE,
+                       "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                       " => OK",
+                       lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                       lock->client_pid, lkowner_utoa(&lock->owner),
+                       lock->fl_start, lock->fl_end);
+            } else if (ret == -EAGAIN) {
+                gf_log(this->name, GF_LOG_TRACE,
+                       "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                       " => NOK",
+                       lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                       lock->client_pid, lkowner_utoa(&lock->owner),
+                       lock->user_flock.l_start, lock->user_flock.l_len);
+                if (can_block) {
+                    unref = _gf_false;
                 }
-out:
-		if (unref)
-			__pl_inodelk_unref (lock);
-        }
-        pthread_mutex_unlock (&pl_inode->mutex);
-	if (ctx)
-		pthread_mutex_unlock (&ctx->lock);
-
-        if (need_inode_unref)
-                inode_unref (pl_inode->inode);
-
-        /* The following (extra) unref corresponds to the ref that
-         * was done at the time the lock was granted.
-         */
-        if ((fl_type == F_UNLCK) && (ret == 0)) {
-                inode_unref (pl_inode->inode);
-                grant_blocked_inode_locks (this, pl_inode, dom);
-        }
+            }
+            /* The extra ref taken at the start of this function must be
+             * negated unless the lock was granted (ret == 0) or is left
+             * blocked (-EAGAIN with can_block); any other failure drops it. */
+            need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block);
+            if (ctx && !need_inode_unref) {
+                list_add_tail(&lock->client_list, &ctx->inodelk_lockers);
+            }
+        } else {
+            /* Irrespective of whether unlock succeeds or not,
+             * the extra inode ref that was done at the start of
+             * this function must be negated. Towards this,
+             * @need_inode_unref flag is set unconditionally here.
+             */
+            need_inode_unref = _gf_true;
+            retlock = __inode_unlock_lock(this, lock, dom);
+            if (!retlock) {
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "Bad Unlock issued on Inode lock");
+                ret = -EINVAL;
+                goto out;
+            }
+            list_del_init(&retlock->client_list);
+            __pl_inodelk_unref(retlock);
 
-        return ret;
+            pl_inode_remove_unlocked(this, pl_inode, &wake);
+
+            ret = 0;
+        }
+    out:
+        if (unref)
+            __pl_inodelk_unref(lock);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    if (ctx)
+        pthread_mutex_unlock(&ctx->lock);
+
+    pl_inode_remove_wake(&wake);
+
+    /* The following (extra) unref corresponds to the ref that
+     * was done at the time the lock was granted.
+     */
+    if ((fl_type == F_UNLCK) && (ret == 0)) {
+        inode_unref(pl_inode->inode);
+        grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend);
+    }
+
+    if (need_inode_unref) {
+        inode_unref(pl_inode->inode);
+    }
+
+    if (pcontend != NULL) {
+        inodelk_contention_notify(this, pcontend);
+    }
+
+    return ret;
 }
 
 /* Create a new inode_lock_t */
-pl_inode_lock_t *
-new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
-                call_frame_t *frame, xlator_t *this, const char *volume,
-                char *conn_id)
+static pl_inode_lock_t *
+new_inode_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
+               call_frame_t *frame, xlator_t *this, const char *volume,
+               char *conn_id, int32_t *op_errno)
 
 {
-        pl_inode_lock_t *lock = NULL;
-
-        lock = GF_CALLOC (1, sizeof (*lock),
-                          gf_locks_mt_pl_inode_lock_t);
-        if (!lock) {
-                return NULL;
-        }
-
-        lock->fl_start = flock->l_start;
-        lock->fl_type  = flock->l_type;
+    pl_inode_lock_t *lock = NULL;
+
+    if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+
+    lock = GF_CALLOC(1, sizeof(*lock), gf_locks_mt_pl_inode_lock_t);
+    if (!lock) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    lock->fl_start = flock->l_start;
+    lock->fl_type = flock->l_type;
+
+    if (flock->l_len == 0)
+        lock->fl_end = LLONG_MAX;
+    else
+        lock->fl_end = flock->l_start + flock->l_len - 1;
+
+    lock->client = client;
+    lock->client_pid = client_pid;
+    lock->volume = volume;
+    lock->owner = frame->root->lk_owner;
+    lock->frame = frame;
+    lock->this = this;
+
+    if (conn_id) {
+        lock->connection_id = gf_strdup(conn_id);
+    }
+
+    INIT_LIST_HEAD(&lock->list);
+    INIT_LIST_HEAD(&lock->blocked_locks);
+    INIT_LIST_HEAD(&lock->client_list);
+    INIT_LIST_HEAD(&lock->contend);
+    __pl_inodelk_ref(lock);
 
-        if (flock->l_len == 0)
-                lock->fl_end = LLONG_MAX;
-        else
-                lock->fl_end = flock->l_start + flock->l_len - 1;
-
-        lock->client     = client;
-        lock->client_pid = client_pid;
-        lock->volume     = volume;
-        lock->owner      = frame->root->lk_owner;
-        lock->frame      = frame;
-        lock->this       = this;
-
-        if (conn_id) {
-                lock->connection_id = gf_strdup (conn_id);
-        }
-
-        INIT_LIST_HEAD (&lock->list);
-        INIT_LIST_HEAD (&lock->blocked_locks);
-	INIT_LIST_HEAD (&lock->client_list);
-        __pl_inodelk_ref (lock);
-
-        return lock;
+out:
+    return lock;
 }
 
 int32_t
-_pl_convert_volume (const char *volume, char **res)
+_pl_convert_volume(const char *volume, char **res)
 {
-        char    *mdata_vol = NULL;
-        int     ret = 0;
+    char *mdata_vol = NULL;
+    int ret = 0;
 
-        mdata_vol = strrchr (volume, ':');
-        //if the volume already ends with :metadata don't bother
-        if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0))
-                return 0;
-
-        ret = gf_asprintf (res, "%s:metadata", volume);
-        if (ret <= 0)
-                return ENOMEM;
+    mdata_vol = strrchr(volume, ':');
+    // if the volume already ends with :metadata don't bother
+    if (mdata_vol && (strcmp(mdata_vol, ":metadata") == 0))
         return 0;
+
+    ret = gf_asprintf(res, "%s:metadata", volume);
+    if (ret <= 0)
+        return ENOMEM;
+    return 0;
 }
 
 int32_t
-_pl_convert_volume_for_special_range (struct gf_flock *flock,
-                                      const char *volume, char **res)
+_pl_convert_volume_for_special_range(struct gf_flock *flock, const char *volume,
+                                     char **res)
 {
-        int32_t     ret = 0;
+    int32_t ret = 0;
 
-        if ((flock->l_start == LLONG_MAX -1) &&
-            (flock->l_len == 0)) {
-                ret = _pl_convert_volume (volume, res);
-        }
+    if ((flock->l_start == LLONG_MAX - 1) && (flock->l_len == 0)) {
+        ret = _pl_convert_volume(volume, res);
+    }
 
-        return ret;
+    return ret;
 }
 
 /* Common inodelk code called from pl_inodelk and pl_finodelk */
 int
-pl_common_inodelk (call_frame_t *frame, xlator_t *this,
-                   const char *volume, inode_t *inode, int32_t cmd,
-                   struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata)
-{
-        int32_t           op_ret     = -1;
-        int32_t           op_errno   = 0;
-        int               ret        = -1;
-        GF_UNUSED int     dict_ret   = -1;
-        int               can_block  = 0;
-        pl_inode_t *      pinode     = NULL;
-        pl_inode_lock_t * reqlock    = NULL;
-        pl_dom_list_t *   dom        = NULL;
-        char             *res        = NULL;
-        char             *res1       = NULL;
-        char             *conn_id    = NULL;
-        pl_ctx_t         *ctx        = NULL;
-
-        if (xdata)
-                dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
-
-        VALIDATE_OR_GOTO (frame, out);
-        VALIDATE_OR_GOTO (inode, unwind);
-        VALIDATE_OR_GOTO (flock, unwind);
-
-        if ((flock->l_start < 0) || (flock->l_len < 0)) {
-                op_errno = EINVAL;
-                goto unwind;
+pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  inode_t *inode, int32_t cmd, struct gf_flock *flock,
+                  loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int ret = -1;
+    GF_UNUSED int dict_ret = -1;
+    int can_block = 0;
+    short lock_type = 0;
+    pl_inode_t *pinode = NULL;
+    pl_inode_lock_t *reqlock = NULL;
+    pl_dom_list_t *dom = NULL;
+    char *res = NULL;
+    char *res1 = NULL;
+    char *conn_id = NULL;
+    pl_ctx_t *ctx = NULL;
+
+    if (xdata)
+        dict_ret = dict_get_str(xdata, "connection-id", &conn_id);
+
+    VALIDATE_OR_GOTO(frame, out);
+    VALIDATE_OR_GOTO(inode, unwind);
+    VALIDATE_OR_GOTO(flock, unwind);
+
+    if ((flock->l_start < 0) || (flock->l_len < 0)) {
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    op_errno = _pl_convert_volume_for_special_range(flock, volume, &res);
+    if (op_errno)
+        goto unwind;
+    if (res)
+        volume = res;
+
+    pl_trace_in(this, frame, fd, loc, cmd, flock, volume);
+
+    if (frame->root->client) {
+        ctx = pl_ctx_get(frame->root->client, this);
+        if (!ctx) {
+            op_errno = ENOMEM;
+            gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+            goto unwind;
         }
+    }
 
-        op_errno = _pl_convert_volume_for_special_range (flock, volume, &res);
-        if (op_errno)
-                goto unwind;
-        if (res)
-                volume = res;
-
-        pl_trace_in (this, frame, fd, loc, cmd, flock, volume);
-
-	if (frame->root->client) {
-		ctx = pl_ctx_get (frame->root->client, this);
-		if (!ctx) {
-			op_errno = ENOMEM;
-			gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed");
-			goto unwind;
-		}
-	}
-
-        pinode = pl_inode_get (this, inode);
-        if (!pinode) {
-                op_errno = ENOMEM;
-                goto unwind;
-        }
+    pinode = pl_inode_get(this, inode, NULL);
+    if (!pinode) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
 
-        dom = get_domain (pinode, volume);
-        if (!dom) {
-                op_errno = ENOMEM;
-                goto unwind;
-        }
+    dom = get_domain(pinode, volume);
+    if (!dom) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
 
-        reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid,
-                                  frame, this, dom->domain, conn_id);
+    reqlock = new_inode_lock(flock, frame->root->client, frame->root->pid,
+                             frame, this, dom->domain, conn_id, &op_errno);
 
-        if (!reqlock) {
-                op_ret = -1;
-                op_errno = ENOMEM;
-                goto unwind;
-        }
+    if (!reqlock) {
+        op_ret = -1;
+        goto unwind;
+    }
 
-
-        switch (cmd) {
+    switch (cmd) {
         case F_SETLKW:
-                can_block = 1;
+            can_block = 1;
 
-                /* fall through */
+            /* fall through */
 
         case F_SETLK:
-                memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock));
-                ret = pl_inode_setlk (this, ctx, pinode, reqlock, can_block,
-				      dom, inode);
-
-                if (ret < 0) {
-                        if ((can_block) && (F_UNLCK != flock->l_type)) {
-                                pl_trace_block (this, frame, fd, loc,
-                                                cmd, flock, volume);
-                                goto out;
-                        }
-                        gf_log (this->name, GF_LOG_TRACE, "returning EAGAIN");
-                        op_errno = -ret;
-                        goto unwind;
+            lock_type = flock->l_type;
+            memcpy(&reqlock->user_flock, flock, sizeof(struct gf_flock));
+            ret = pl_inode_setlk(this, ctx, pinode, reqlock, can_block, dom,
+                                 inode);
+
+            if (ret < 0) {
+                if (ret == -EAGAIN) {
+                    if (can_block && (F_UNLCK != lock_type)) {
+                        goto out;
+                    }
+                    gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
+                } else {
+                    gf_log(this->name, GF_LOG_TRACE, "returning %d", ret);
                 }
-                break;
+                op_errno = -ret;
+                goto unwind;
+            }
+            break;
 
         default:
-                op_errno = ENOTSUP;
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "Lock command F_GETLK not supported for [f]inodelk "
-                        "(cmd=%d)",
-                        cmd);
-                goto unwind;
-        }
+            op_errno = ENOTSUP;
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock command F_GETLK not supported for [f]inodelk "
+                   "(cmd=%d)",
+                   cmd);
+            goto unwind;
+    }
 
-        op_ret = 0;
+    op_ret = 0;
 
 unwind:
-        if (flock != NULL)
-                pl_trace_out (this, frame, fd, loc, cmd, flock, op_ret,
-                              op_errno, volume);
+    if (flock != NULL)
+        pl_trace_out(this, frame, fd, loc, cmd, flock, op_ret, op_errno,
+                     volume);
 
-        STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL);
+    STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, NULL);
 out:
-        GF_FREE (res);
-        GF_FREE (res1);
-        return 0;
+    GF_FREE(res);
+    GF_FREE(res1);
+    return 0;
 }
 
 int
-pl_inodelk (call_frame_t *frame, xlator_t *this,
-            const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock,
-            dict_t *xdata)
+pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *flock, dict_t *xdata)
 {
-        pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock,
-                           loc, NULL, xdata);
+    pl_common_inodelk(frame, this, volume, loc->inode, cmd, flock, loc, NULL,
+                      xdata);
 
-        return 0;
+    return 0;
 }
 
 int
-pl_finodelk (call_frame_t *frame, xlator_t *this,
-             const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock,
-             dict_t *xdata)
+pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata)
 {
-        pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock,
-                           NULL, fd, xdata);
-
-        return 0;
+    pl_common_inodelk(frame, this, volume, fd->inode, cmd, flock, NULL, fd,
+                      xdata);
 
+    return 0;
 }
 
 static int32_t
-__get_inodelk_dom_count (pl_dom_list_t *dom)
+__get_inodelk_dom_count(pl_dom_list_t *dom)
 {
-        pl_inode_lock_t     *lock   = NULL;
-        int32_t             count   = 0;
-
-        list_for_each_entry (lock, &dom->inodelk_list, list) {
-                count++;
-        }
-        list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) {
-                count++;
-        }
-        return count;
+    pl_inode_lock_t *lock = NULL;
+    int32_t count = 0;
+
+    list_for_each_entry(lock, &dom->inodelk_list, list) { count++; }
+    list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks)
+    {
+        count++;
+    }
+    return count;
 }
 
 /* Returns the no. of locks (blocked/granted) held on a given domain name
  * If @domname is NULL, returns the no. of locks in all the domains present.
  * If @domname is non-NULL and non-existent, returns 0 */
 int32_t
-__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname)
+__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname)
 {
-        int32_t            count  = 0;
-        pl_dom_list_t     *dom    = NULL;
-
-        list_for_each_entry (dom, &pl_inode->dom_list, inode_list) {
-                if (domname) {
-                        if (strcmp (domname, dom->domain) == 0) {
-                                count = __get_inodelk_dom_count (dom);
-                                goto out;
-                        }
-
-                } else {
-                    /* Counting locks from all domains */
-                        count += __get_inodelk_dom_count (dom);
+    int32_t count = 0;
+    pl_dom_list_t *dom = NULL;
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        if (domname) {
+            if (strcmp(domname, dom->domain) == 0) {
+                count = __get_inodelk_dom_count(dom);
+                goto out;
+            }
 
-                }
+        } else {
+            /* Counting locks from all domains */
+            count += __get_inodelk_dom_count(dom);
         }
+    }
 
 out:
-        return count;
+    return count;
 }
 
 int32_t
-get_inodelk_count (xlator_t *this, inode_t *inode, char *domname)
+get_inodelk_count(xlator_t *this, inode_t *inode, char *domname)
 {
-        pl_inode_t   *pl_inode = NULL;
-        uint64_t      tmp_pl_inode = 0;
-        int           ret      = 0;
-        int32_t       count    = 0;
+    pl_inode_t *pl_inode = NULL;
+    uint64_t tmp_pl_inode = 0;
+    int ret = 0;
+    int32_t count = 0;
 
-        ret = inode_ctx_get (inode, this, &tmp_pl_inode);
-        if (ret != 0) {
-                goto out;
-        }
+    ret = inode_ctx_get(inode, this, &tmp_pl_inode);
+    if (ret != 0) {
+        goto out;
+    }
 
-        pl_inode = (pl_inode_t *)(long) tmp_pl_inode;
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
 
-        pthread_mutex_lock (&pl_inode->mutex);
-        {
-                count = __get_inodelk_count (this, pl_inode, domname);
-        }
-        pthread_mutex_unlock (&pl_inode->mutex);
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        count = __get_inodelk_count(this, pl_inode, domname);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
 
 out:
-        return count;
+    return count;
 }