summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libglusterfs/src/upcall-utils.h17
-rw-r--r--rpc/rpc-lib/src/protocol-common.h2
-rw-r--r--rpc/xdr/src/glusterfs3.h156
-rw-r--r--rpc/xdr/src/glusterfs4-xdr.x17
-rw-r--r--rpc/xdr/src/libgfxdr.sym4
-rw-r--r--tests/basic/ec/lock-contention.t62
-rw-r--r--tests/volume.rc12
-rw-r--r--xlators/cluster/ec/src/ec-common.c253
-rw-r--r--xlators/cluster/ec/src/ec-common.h1
-rw-r--r--xlators/cluster/ec/src/ec-types.h2
-rw-r--r--xlators/cluster/ec/src/ec.c77
-rw-r--r--xlators/features/locks/src/clear.c36
-rw-r--r--xlators/features/locks/src/common.h12
-rw-r--r--xlators/features/locks/src/entrylk.c290
-rw-r--r--xlators/features/locks/src/inodelk.c309
-rw-r--r--xlators/features/locks/src/locks.h8
-rw-r--r--xlators/features/locks/src/pl-messages.h4
-rw-r--r--xlators/features/locks/src/posix.c40
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c20
-rw-r--r--xlators/protocol/client/src/client-callback.c97
-rw-r--r--xlators/protocol/client/src/client-messages.h4
-rw-r--r--xlators/protocol/server/src/server.c44
22 files changed, 1203 insertions, 264 deletions
diff --git a/libglusterfs/src/upcall-utils.h b/libglusterfs/src/upcall-utils.h
index 3b5dce33e45..48d10382c10 100644
--- a/libglusterfs/src/upcall-utils.h
+++ b/libglusterfs/src/upcall-utils.h
@@ -63,6 +63,8 @@ typedef enum {
GF_UPCALL_EVENT_NULL,
GF_UPCALL_CACHE_INVALIDATION,
GF_UPCALL_RECALL_LEASE,
+ GF_UPCALL_INODELK_CONTENTION,
+ GF_UPCALL_ENTRYLK_CONTENTION,
} gf_upcall_event_t;
struct gf_upcall {
@@ -88,4 +90,19 @@ struct gf_upcall_recall_lease {
dict_t *dict;
};
+struct gf_upcall_inodelk_contention {
+ struct gf_flock flock;
+ pid_t pid;
+ const char *domain;
+ dict_t *xdata;
+};
+
+struct gf_upcall_entrylk_contention {
+ uint32_t type;
+ pid_t pid;
+ const char *name;
+ const char *domain;
+ dict_t *xdata;
+};
+
#endif /* _UPCALL_UTILS_H */
diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h
index aee34302205..aafd94400c6 100644
--- a/rpc/rpc-lib/src/protocol-common.h
+++ b/rpc/rpc-lib/src/protocol-common.h
@@ -150,6 +150,8 @@ enum gf_cbk_procnum {
GF_CBK_CHILD_DOWN,
GF_CBK_RECALL_LEASE,
GF_CBK_STATEDUMP,
+ GF_CBK_INODELK_CONTENTION,
+ GF_CBK_ENTRYLK_CONTENTION,
GF_CBK_MAXVALUE,
};
diff --git a/rpc/xdr/src/glusterfs3.h b/rpc/xdr/src/glusterfs3.h
index 2da5594a347..eef39416b5c 100644
--- a/rpc/xdr/src/glusterfs3.h
+++ b/rpc/xdr/src/glusterfs3.h
@@ -419,6 +419,162 @@ gf_proto_cache_invalidation_to_upcall (xlator_t *this,
return ret;
}
+static inline int
+gf_proto_inodelk_contention_to_upcall (struct gfs4_inodelk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_inodelk_contention *tmp = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ this = THIS;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data;
+
+ gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid);
+
+ gf_proto_flock_to_flock(&lc->flock, &tmp->flock);
+ tmp->pid = lc->pid;
+ tmp->domain = lc->domain;
+ if ((tmp->domain != NULL) && (*tmp->domain == 0)) {
+ tmp->domain = NULL;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val,
+ lc->xdata.xdata_len, ret, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_inodelk_contention_from_upcall (xlator_t *this,
+ struct gfs4_inodelk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_inodelk_contention *tmp = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data;
+
+ gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid);
+
+ gf_proto_flock_from_flock(&lc->flock, &tmp->flock);
+ lc->pid = tmp->pid;
+ lc->domain = (char *)tmp->domain;
+ if (lc->domain == NULL) {
+ lc->domain = "";
+ }
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val,
+ lc->xdata.xdata_len, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_entrylk_contention_to_upcall (struct gfs4_entrylk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_entrylk_contention *tmp = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ this = THIS;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data;
+
+ gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid);
+
+ tmp->type = lc->type;
+ tmp->name = lc->name;
+ if ((tmp->name != NULL) && (*tmp->name == 0)) {
+ tmp->name = NULL;
+ }
+ tmp->pid = lc->pid;
+ tmp->domain = lc->domain;
+ if ((tmp->domain != NULL) && (*tmp->domain == 0)) {
+ tmp->domain = NULL;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val,
+ lc->xdata.xdata_len, ret, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
+static inline int
+gf_proto_entrylk_contention_from_upcall (xlator_t *this,
+ struct gfs4_entrylk_contention_req *lc,
+ struct gf_upcall *gf_up_data)
+{
+ struct gf_upcall_entrylk_contention *tmp = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO(this->name, lc, out);
+ GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out);
+
+ tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data;
+
+ gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid);
+
+ lc->type = tmp->type;
+ lc->name = (char *)tmp->name;
+ if (lc->name == NULL) {
+ lc->name = "";
+ }
+ lc->pid = tmp->pid;
+ lc->domain = (char *)tmp->domain;
+ if (lc->domain == NULL) {
+ lc->domain = "";
+ }
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val,
+ lc->xdata.xdata_len, op_errno, out);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ ret = -op_errno;
+ }
+
+ return ret;
+}
+
extern int dict_to_xdr (dict_t *this, gfx_dict *xdict);
extern int xdr_to_dict (gfx_dict *xdict, dict_t **to);
diff --git a/rpc/xdr/src/glusterfs4-xdr.x b/rpc/xdr/src/glusterfs4-xdr.x
index 7396b566fa7..ef0cfde0802 100644
--- a/rpc/xdr/src/glusterfs4-xdr.x
+++ b/rpc/xdr/src/glusterfs4-xdr.x
@@ -86,3 +86,20 @@ struct gfs4_namelink_req {
string bname<>;
opaque xdata<>;
};
+
+struct gfs4_inodelk_contention_req {
+ opaque gfid[16];
+ struct gf_proto_flock flock;
+ unsigned int pid;
+ string domain<>;
+ opaque xdata<>;
+};
+
+struct gfs4_entrylk_contention_req {
+ opaque gfid[16];
+ unsigned int type;
+ unsigned int pid;
+ string name<>;
+ string domain<>;
+ opaque xdata<>;
+};
diff --git a/rpc/xdr/src/libgfxdr.sym b/rpc/xdr/src/libgfxdr.sym
index 8af956ef5a9..83f1efc732a 100644
--- a/rpc/xdr/src/libgfxdr.sym
+++ b/rpc/xdr/src/libgfxdr.sym
@@ -155,8 +155,12 @@ xdr_gfs3_xattrop_req
xdr_gfs3_xattrop_rsp
xdr_gfs3_zerofill_req
xdr_gfs3_zerofill_rsp
+xdr_gfs4_entrylk_contention_req
+xdr_gfs4_entrylk_contention_rsp
xdr_gfs4_icreate_req
xdr_gfs4_icreate_rsp
+xdr_gfs4_inodelk_contention_req
+xdr_gfs4_inodelk_contention_rsp
xdr_gfs4_namelink_req
xdr_gfs4_namelink_rsp
xdr_gf_set_lk_ver_req
diff --git a/tests/basic/ec/lock-contention.t b/tests/basic/ec/lock-contention.t
new file mode 100644
index 00000000000..8f86cee16ad
--- /dev/null
+++ b/tests/basic/ec/lock-contention.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This test verifies that when 'lock-notify-contention' option is enabled,
+# locks xlator actually sends an upcall notification that causes the acquired
+# lock from one client to be released before it's supposed to when another
+# client accesses the file.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function elapsed_time() {
+ local start="`date +%s`"
+
+ if [[ "test" == `cat "$1"` ]]; then
+ echo "$((`date +%s` - ${start}))"
+ fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 features.locks-notify-contention off
+TEST $CLI volume set $V0 disperse.eager-lock on
+TEST $CLI volume set $V0 disperse.eager-lock-timeout 6
+TEST $CLI volume set $V0 disperse.other-eager-lock on
+TEST $CLI volume set $V0 disperse.other-eager-lock-timeout 6
+TEST $CLI volume start $V0
+
+TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0
+
+TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1
+
+TEST $(echo "test" >$M0/file)
+
+# With locks-notify-contention set to off, accessing the file from another
+# client should take 6 seconds. Checking against 3 seconds to be safe.
+elapsed="$(elapsed_time $M1/file)"
+TEST [[ ${elapsed} -ge 3 ]]
+
+elapsed="$(elapsed_time $M0/file)"
+TEST [[ ${elapsed} -ge 3 ]]
+
+TEST $CLI volume set $V0 features.locks-notify-contention on
+
+# With locks-notify-contention set to on, accessing the file from another
+# client should be fast. Checking against 3 seconds to be safe.
+elapsed="$(elapsed_time $M1/file)"
+TEST [[ ${elapsed} -le 3 ]]
+
+elapsed="$(elapsed_time $M0/file)"
+TEST [[ ${elapsed} -le 3 ]]
+
+cleanup
diff --git a/tests/volume.rc b/tests/volume.rc
index 3a48b43d2e0..3ee83624058 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -93,7 +93,8 @@ function remove_brick_status_completed_field {
function get_mount_process_pid {
local vol=$1
- ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1
+ local mnt=$2
+ ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1
}
function get_nfs_pid ()
@@ -126,7 +127,8 @@ function generate_statedump {
function generate_mount_statedump {
local vol=$1
- generate_statedump $(get_mount_process_pid $vol)
+ local mnt=$2
+ generate_statedump $(get_mount_process_pid $vol $mnt)
}
function cleanup_mount_statedump {
@@ -205,14 +207,16 @@ function ec_child_up_status {
local vol=$1
local dist_id=$2
local brick_id=$(($3 + 1))
- local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol))
+ local mnt=$4
+ local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol $mnt))
echo "${mask: -$brick_id:1}"
}
function ec_child_up_count {
local vol=$1
local dist_id=$2
- ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol)
+ local mnt=$3
+ ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol $mnt)
}
function ec_child_up_status_shd {
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index cb627a92c9c..fbc7ac97aa0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
return _gf_true;
}
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+ ec_lock_link_t *timer_link;
+
+ /* If we don't have any timer, there's nothing to cancel. */
+ if (lock->timer == NULL) {
+ return NULL;
+ }
+
+ /* We are trying to access a lock that has an unlock timer active.
+ * This means that the lock must be idle, i.e. no fop can be in the
+ * owner, waiting or frozen lists. It also means that the lock cannot
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock. */
+ GF_ASSERT ((lock->refs_owners == 1) &&
+ list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+ /* We take the timer_link before cancelling the timer, since a
+ * successful cancellation will destroy it. It must not be NULL
+ * because it references the fop responsible for the delayed unlock
+ * that we are currently trying to cancel. */
+ timer_link = lock->timer->data;
+ GF_ASSERT(timer_link != NULL);
+
+ if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+ /* It's too late to avoid the execution of the timer callback.
+ * Since we need to be sure that the callback has access to all
+ * needed resources, we cannot resume the execution of the
+ * timer fop now. This will be done in the callback. */
+ timer_link = NULL;
+ } else {
+ /* The timer has been cancelled. The fop referenced by
+ * timer_link holds the last reference. The caller is
+ * responsible to release it when not needed anymore. */
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ }
+
+ /* We have two options here:
+ *
+ * 1. The timer has been successfully cancelled.
+ *
+ * This is the easiest case and we can continue with the currently
+ * acquired lock.
+ *
+ * 2. The timer callback has already been fired.
+ *
+ * In this case we have not been able to cancel the timer before
+ * the timer callback has been fired, but we also know that
+ * lock->timer != NULL. This means that the timer callback is still
+ * trying to acquire the inode mutex that we currently own. We are
+ * safe until we release it. In this case we can safely clear
+ * lock->timer. This will cause that the timer callback does nothing
+ * once it acquires the mutex.
+ */
+ lock->timer = NULL;
+
+ return timer_link;
+}
+
static gf_boolean_t
ec_lock_assign_owner(ec_lock_link_t *link)
{
@@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)
* empty. */
GF_ASSERT(list_empty(&lock->frozen));
- if (lock->timer != NULL) {
- /* We are trying to acquire a lock that has an unlock timer active.
- * This means that the lock must be idle, i.e. no fop can be in the
- * owner, waiting or frozen lists. It also means that the lock cannot
- * have been marked as being released (this is done without timers).
- * There should only be one owner reference, but it's possible that
- * some fops are being prepared to use this lock.
- */
- GF_ASSERT ((lock->refs_owners == 1) &&
- list_empty(&lock->owners) && list_empty(&lock->waiting));
-
- /* We take the timer_link before cancelling the timer, since a
- * successful cancellation will destroy it. It must not be NULL
- * because it references the fop responsible for the delayed unlock
- * that we are currently trying to cancel. */
- timer_link = lock->timer->data;
- GF_ASSERT(timer_link != NULL);
-
- if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
- /* It's too late to avoid the execution of the timer callback.
- * Since we need to be sure that the callback has access to all
- * needed resources, we cannot resume the execution of the timer
- * fop now. This will be done in the callback.
- */
- timer_link = NULL;
- } else {
- /* The timer has been cancelled, so we need to release the owner
- * reference that was held by the fop waiting for the timer. This
- * can be the last reference, but we'll immediately increment it
- * for the current fop, so no need to check it.
- */
- lock->refs_owners--;
-
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
- }
-
- /* We have two options here:
- *
- * 1. The timer has been successfully cancelled.
- *
- * This is the easiest case and we can continue with the currently
- * acquired lock.
- *
- * 2. The timer callback has already been fired.
- *
- * In this case we have not been able to cancel the timer before
- * the timer callback has been fired, but we also know that
- * lock->timer != NULL. This means that the timer callback is still
- * trying to acquire the inode mutex that we currently own. We are
- * safe until we release it. In this case we can safely clear
- * lock->timer. This will cause that the timer callback does nothing
- * once it acquires the mutex.
- */
- lock->timer = NULL;
- }
+ timer_link = ec_lock_timer_cancel(fop->xl, lock);
if (!list_empty(&lock->owners)) {
/* There are other owners of this lock. We can only take ownership if
@@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)
}
list_add_tail(&link->owner_list, &lock->owners);
- lock->refs_owners++;
+
+ /* If timer_link is not NULL, it means that we have inherited the owner
+ * reference assigned to the timer fop. In this case we simply reuse it.
+ * Otherwise we need to increase the number of owners. */
+ if (timer_link == NULL) {
+ lock->refs_owners++;
+ }
assigned = _gf_true;
@@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)
ec_resume(link->fop, 0);
}
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+ ec_lock_t *lock;
+ ec_inode_t *ctx;
+ ec_lock_link_t *timer_link = NULL;
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get(inode, ec->xl);
+ if (ctx == NULL) {
+ goto done;
+ }
+ lock = ctx->inode_lock;
+ if ((lock == NULL) || !lock->acquired || lock->release) {
+ goto done;
+ }
+
+ gf_msg_debug(ec->xl->name, 0,
+ "Releasing inode %p due to lock contention", inode);
+
+ /* The lock is not marked to be released, so the frozen list should be
+ * empty. */
+ GF_ASSERT(list_empty(&lock->frozen));
+
+ timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+ /* We mark the lock to be released as soon as possible. */
+ lock->release = _gf_true;
+
+done:
+ UNLOCK(&inode->lock);
+
+ /* If we have cancelled the timer, we need to start the unlock of the
+ * inode. If there was a timer but we have been unable to cancel it
+ * because it was just triggered, the timer callback will take care
+ * of releasing the inode. */
+ if (timer_link != NULL) {
+ ec_unlock_now(timer_link);
+ }
+}
+
void ec_unlock_timer_add(ec_lock_link_t *link);
void
@@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)
ec_unlock_timer_del(data);
}
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+ /* Fops with no locks at this point mean that they are sent as sub-fops
+ * of other higher level fops. In this case we simply assume that the
+ * parent fop will take correct care of the eager lock. */
+ if (fop->lock_count == 0) {
+ return _gf_true;
+ }
+
+ /* We may have more than one lock, but this only happens in the rename
+ * fop, and both locks will reference an inode of the same type (a
+ * directory in this case), so we only need to check the first lock. */
+ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock;
+ }
+
+ return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ return ec->eager_lock_timeout;
+ }
+
+ return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+ struct timespec delay;
+ ec_fop_data_t *fop = link->fop;
+ ec_lock_t *lock = link->lock;
+
+ delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+ delay.tv_nsec = 0;
+ lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+ ec_unlock_timer_cbk, link);
+ if (lock->timer == NULL) {
+ gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+ EC_MSG_UNLOCK_DELAY_FAILED,
+ "Unable to delay an unlock");
+
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
void ec_unlock_timer_add(ec_lock_link_t *link)
{
- struct timespec delay;
ec_fop_data_t *fop = link->fop;
ec_lock_t *lock = link->lock;
gf_boolean_t now = _gf_false;
@@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
lock->release);
- delay.tv_sec = 1;
- delay.tv_nsec = 0;
- lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
- ec_unlock_timer_cbk, link);
- if (lock->timer == NULL) {
- gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
- EC_MSG_UNLOCK_DELAY_FAILED,
- "Unable to delay an unlock");
-
+ if (!ec_lock_delay_create(link)) {
/* We are unable to create a new timer. We immediately release
* the lock. */
lock->release = now = _gf_true;
}
+
} else {
ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
lock->release);
@@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)
ec_update_info(&fop->locks[0]);
}
-static gf_boolean_t
-ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
-{
- /* Fops with no locks at this point mean that they are sent as sub-fops
- * of other higher level fops. In this case we simply assume that the
- * parent fop will take correct care of the eager lock. */
- if (fop->lock_count == 0) {
- return _gf_true;
- }
-
- /* We may have more than one lock, but this only happens in the rename
- * fop, and both locks will reference an inode of the same type (a
- * directory in this case), so we only need to check the first lock. */
- if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
- return ec->eager_lock;
- }
-
- return ec->other_eager_lock;
-}
-
static void
ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
ec_fop_data_t *fop)
@@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
ec = fop->xl->private;
cbk = fop->answer;
- if (ec_use_eager_lock(ec, fop) && cbk != NULL) {
+ if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
if (cbk->xdata != NULL) {
if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
&count) == 0) && (count > 1)) {
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index c3e291585ef..99e2f0653be 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
+void ec_lock_release(ec_t *ec, inode_t *inode);
gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
uint64_t *size);
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23dc434bc42..15b4c77abfe 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -669,6 +669,8 @@ struct _ec {
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
+ uint32_t eager_lock_timeout;
+ uint32_t other_eager_lock_timeout;
struct list_head pending_fops;
struct list_head heal_waiting;
struct list_head healing;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 4c80f1283f1..30b0bdcb29c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options)
bool, failed);
GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options,
bool, failed);
+ GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout,
+ options, uint32, failed);
+ GF_OPTION_RECONF ("other-eager-lock-timeout",
+ ec->other_eager_lock_timeout, options, uint32,
+ failed);
GF_OPTION_RECONF ("background-heals", background_heals, options,
uint32, failed);
GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options,
@@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
}
}
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *ci = NULL;
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ inode_t *inode;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_CACHE_INVALIDATION:
+ ci = upcall->data;
+ ci->flags |= UP_INVAL_ATTR;
+ return _gf_true;
+
+ case GF_UPCALL_INODELK_CONTENTION:
+ lc = upcall->data;
+ if (strcmp(lc->domain, ec->xl->name) != 0) {
+ /* The lock is not owned by EC, ignore it. */
+ return _gf_true;
+ }
+ inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable,
+ upcall->gfid);
+ /* If inode is not found, it means that it's already released,
+ * so we can ignore it. Probably it has been released and
+ * destroyed while the contention notification was being sent.
+ */
+ if (inode != NULL) {
+ ec_lock_release(ec, inode);
+ inode_unref(inode);
+ }
+
+ return _gf_false;
+
+ default:
+ return _gf_true;
+ }
+}
+
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
dict_t *output = NULL;
gf_boolean_t propagate = _gf_true;
int32_t orig_event = event;
- struct gf_upcall *up_data = NULL;
- struct gf_upcall_cache_invalidation *up_ci = NULL;
uintptr_t mask = 0;
gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
event, data, data2);
if (event == GF_EVENT_UPCALL) {
- up_data = (struct gf_upcall *)data;
- if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) {
- up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
- up_ci->flags |= UP_INVAL_ATTR;
- }
+ propagate = ec_upcall(ec, data);
goto done;
}
@@ -664,6 +700,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed);
GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed);
+ GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+ failed);
+ GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+ uint32, failed);
GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed);
GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32,
@@ -1456,6 +1496,29 @@ struct volume_options options[] =
.description = "It's equivalent to the eager-lock option but for non "
"regular files."
},
+ { .key = {"eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "Maximum time (in seconds) that a lock on an inode is "
+ "kept held if no new operations on the inode are "
+ "received."
+ },
+ { .key = {"other-eager-lock-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 60,
+ .default_value = "1",
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+ .tags = { "disperse", "locks", "timeout" },
+ .description = "It's equivalent ot eager-lock-timeout option but for "
+ "non regular files."
+ },
{ .key = {"background-heals"},
.type = GF_OPTION_TYPE_INT,
.min = 0,/*Disabling background heals*/
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index a76d6beacb1..1609fc416d2 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -200,6 +200,7 @@ int
clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
clrlk_args *args, int *blkd, int *granted, int *op_errno)
{
+ posix_locks_private_t *priv;
pl_inode_lock_t *ilock = NULL;
pl_inode_lock_t *tmp = NULL;
struct gf_flock ulock = {0, };
@@ -207,9 +208,20 @@ clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
int bcount = 0;
int gcount = 0;
gf_boolean_t chk_range = _gf_false;
+ struct list_head *pcontend = NULL;
struct list_head released;
+ struct list_head contend;
+ struct timespec now = { };
INIT_LIST_HEAD (&released);
+
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) {
*op_errno = EINVAL;
goto out;
@@ -283,7 +295,10 @@ granted:
ret = 0;
out:
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now, pcontend);
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
*blkd = bcount;
*granted = gcount;
return ret;
@@ -294,15 +309,27 @@ int
clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
clrlk_args *args, int *blkd, int *granted, int *op_errno)
{
+ posix_locks_private_t *priv;
pl_entry_lock_t *elock = NULL;
pl_entry_lock_t *tmp = NULL;
int bcount = 0;
int gcount = 0;
int ret = -1;
+ struct list_head *pcontend = NULL;
struct list_head removed;
struct list_head released;
+ struct list_head contend;
+ struct timespec now;
INIT_LIST_HEAD (&released);
+
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
if (args->kind & CLRLK_BLOCKED)
goto blkd;
@@ -361,12 +388,15 @@ granted:
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
- __pl_entrylk_unref (elock);
+ __pl_entrylk_unref (elock);
}
}
pthread_mutex_unlock (&pl_inode->mutex);
- grant_blocked_entry_locks (this, pl_inode, dom);
+ grant_blocked_entry_locks (this, pl_inode, dom, &now, pcontend);
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
ret = 0;
out:
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 3729ca24bed..50c156feb38 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -69,7 +69,11 @@ get_domain (pl_inode_t *pl_inode, const char *volume);
void
grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom);
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+
+void
+inodelk_contention_notify (xlator_t *this, struct list_head *contend);
void
__delete_inode_lock (pl_inode_lock_t *lock);
@@ -79,7 +83,11 @@ __pl_inodelk_unref (pl_inode_lock_t *lock);
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom);
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+
+void
+entrylk_contention_notify (xlator_t *this, struct list_head *contend);
void pl_update_refkeeper (xlator_t *this, inode_t *inode);
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index 6698516fc83..008d05a34c4 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -13,17 +13,19 @@
#include "logging.h"
#include "common-utils.h"
#include "list.h"
+#include "upcall-utils.h"
#include "locks.h"
#include "clear.h"
#include "common.h"
+#include "pl-messages.h"
void
__pl_entrylk_unref (pl_entry_lock_t *lock)
{
lock->ref--;
if (!lock->ref) {
- GF_FREE ((char *)lock->basename);
+ GF_FREE ((char *)lock->basename);
GF_FREE (lock->connection_id);
GF_FREE (lock);
}
@@ -39,7 +41,7 @@ __pl_entrylk_ref (pl_entry_lock_t *lock)
static pl_entry_lock_t *
new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
- const char *domain, call_frame_t *frame, char *conn_id)
+ const char *domain, call_frame_t *frame, char *conn_id)
{
pl_entry_lock_t *newlock = NULL;
@@ -55,7 +57,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
newlock->client_pid = frame->root->pid;
newlock->volume = domain;
newlock->owner = frame->root->lk_owner;
- newlock->frame = frame;
+ newlock->frame = frame;
newlock->this = frame->this;
if (conn_id) {
@@ -64,9 +66,9 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
INIT_LIST_HEAD (&newlock->domain_list);
INIT_LIST_HEAD (&newlock->blocked_locks);
- INIT_LIST_HEAD (&newlock->client_list);
+ INIT_LIST_HEAD (&newlock->client_list);
- __pl_entrylk_ref (newlock);
+ __pl_entrylk_ref (newlock);
out:
return newlock;
}
@@ -201,6 +203,113 @@ out:
return revoke_lock;
}
+static gf_boolean_t
+__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ struct timespec *now)
+{
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+
+ priv = this->private;
+
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+ return _gf_false;
+ }
+
+ elapsed = now->tv_sec;
+ elapsed -= lock->contention_time.tv_sec;
+ if (now->tv_nsec < lock->contention_time.tv_nsec) {
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+ return _gf_false;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+ * region. This means that currently granted locks might have already
+ * been unlocked by that time. To avoid the lock or the inode to be
+ * destroyed before we process them, we take an additional reference
+ * on both. */
+ inode_ref(lock->pinode->inode);
+ __pl_entrylk_ref(lock);
+
+ lock->contention_time = *now;
+
+ return _gf_true;
+}
+
+void
+entrylk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+ struct gf_upcall up;
+ struct gf_upcall_entrylk_contention lc;
+ pl_entry_lock_t *lock;
+ pl_inode_t *pl_inode;
+ client_t *client;
+ gf_boolean_t notify;
+
+ while (!list_empty(contend)) {
+ lock = list_first_entry(contend, pl_entry_lock_t, contend);
+
+ pl_inode = lock->pinode;
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ /* If the lock has already been released, no notification is
+ * sent. We clear the notification time in this case. */
+ notify = !list_empty(&lock->domain_list);
+ if (!notify) {
+ lock->contention_time.tv_sec = 0;
+ lock->contention_time.tv_nsec = 0;
+ } else {
+ lc.type = lock->type;
+ lc.name = lock->basename;
+ lc.pid = lock->client_pid;
+ lc.domain = lock->volume;
+ lc.xdata = NULL;
+
+ gf_uuid_copy(up.gfid, lock->pinode->gfid);
+ client = (client_t *)lock->client;
+ if (client == NULL) {
+ /* A NULL client can be found if the entrylk
+ * was issued by a server side xlator. */
+ up.client_uid = NULL;
+ } else {
+ up.client_uid = client->client_uid;
+ }
+ }
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ if (notify) {
+ up.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+ up.data = &lc;
+
+ if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+ gf_msg_debug(this->name, 0,
+ "Entrylk contention notification "
+ "failed");
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Entrylk contention notification "
+ "sent");
+ }
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ list_del_init(&lock->contend);
+ __pl_entrylk_unref(lock);
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ inode_unref(pl_inode->inode);
+ }
+}
+
+
/**
* entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
@@ -208,19 +317,27 @@ out:
* @type: type of lock
*/
static pl_entry_lock_t *
-__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock)
+__entrylk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock,
+ struct timespec *now, struct list_head *contend)
{
pl_entry_lock_t *tmp = NULL;
-
- if (list_empty (&dom->entrylk_list))
- return NULL;
+ pl_entry_lock_t *ret = NULL;
list_for_each_entry (tmp, &dom->entrylk_list, domain_list) {
- if (__conflicting_entrylks (tmp, lock))
- return tmp;
+ if (__conflicting_entrylks (tmp, lock)) {
+ if (ret == NULL) {
+ ret = tmp;
+ if (contend == NULL) {
+ break;
+ }
+ }
+ if (__entrylk_needs_contention_notify(this, tmp, now)) {
+ list_add_tail(&tmp->contend, contend);
+ }
+ }
}
- return NULL;
+ return ret;
}
static pl_entry_lock_t *
@@ -228,9 +345,6 @@ __blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
pl_entry_lock_t *tmp = NULL;
- if (list_empty (&dom->blocked_entrylks))
- return NULL;
-
list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) {
if (names_conflict (tmp->basename, lock->basename))
return lock;
@@ -426,6 +540,27 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)
return NULL;
}
+static int
+__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+ pl_entry_lock_t *lock, int nonblock)
+{
+ struct timeval now;
+
+ gettimeofday(&now, NULL);
+
+ if (nonblock)
+ goto out;
+
+ lock->blkd_time = now;
+ list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
+
+ gf_msg_trace (this->name, 0, "Blocking lock: {pinode=%p, basename=%s}",
+ pinode, lock->basename);
+
+out:
+ return -EAGAIN;
+}
+
/**
* __lock_entrylk - lock a name in a directory
* @inode: inode for the directory in which to lock
@@ -439,24 +574,15 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock)
int
__lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
- int nonblock, pl_dom_list_t *dom)
+ int nonblock, pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
pl_entry_lock_t *conf = NULL;
int ret = -EAGAIN;
- conf = __entrylk_grantable (dom, lock);
+ conf = __entrylk_grantable (this, dom, lock, now, contend);
if (conf) {
- ret = -EAGAIN;
- if (nonblock)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "Blocking lock: {pinode=%p, basename=%s}",
- pinode, lock->basename);
-
+ ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
goto out;
}
@@ -471,20 +597,15 @@ __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
* granted, without which self-heal can't progress.
* TODO: Find why 'owner_has_lock' is checked even for blocked locks.
*/
- if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
- ret = -EAGAIN;
- if (nonblock)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Lock is grantable, but blocking to prevent starvation");
- gf_log (this->name, GF_LOG_TRACE,
- "Blocking lock: {pinode=%p, basename=%s}",
- pinode, lock->basename);
+ if (__blocked_entrylk_conflict (dom, lock) &&
+ !(__owner_has_lock (dom, lock))) {
+ if (nonblock == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock is grantable, but blocking to prevent "
+ "starvation");
+ }
+ ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
goto out;
}
@@ -551,7 +672,8 @@ out:
void
__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom, struct list_head *granted)
+ pl_dom_list_t *dom, struct list_head *granted,
+ struct timespec *now, struct list_head *contend)
{
int bl_ret = 0;
pl_entry_lock_t *bl = NULL;
@@ -566,7 +688,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
list_del_init (&bl->blocked_locks);
- bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom);
+ bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom, now,
+ contend);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
@@ -578,7 +701,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grants locks if possible which are blocked on a lock */
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom)
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
struct list_head granted_list;
pl_entry_lock_t *tmp = NULL;
@@ -589,7 +713,7 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
__grant_blocked_entry_locks (this, pl_inode, dom,
- &granted_list);
+ &granted_list, now, contend);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -610,8 +734,6 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
}
}
pthread_mutex_unlock (&pl_inode->mutex);
-
- return;
}
@@ -637,9 +759,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
int nonblock = 0;
gf_boolean_t need_inode_unref = _gf_false;
posix_locks_private_t *priv = NULL;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
+ struct timespec now = { };
priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD(pcontend);
+ timespec_now(&now);
+ }
+
if (xdata)
dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
@@ -722,7 +853,8 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
{
reqlock->pinode = pinode;
- ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom);
+ ret = __lock_entrylk (this, pinode, reqlock, nonblock,
+ dom, &now, pcontend);
if (ret == 0) {
reqlock->frame = NULL;
op_ret = 0;
@@ -778,7 +910,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
if (ctx)
pthread_mutex_unlock (&ctx->lock);
- grant_blocked_entry_locks (this, pinode, dom);
+ grant_blocked_entry_locks (this, pinode, dom, &now, pcontend);
break;
@@ -810,6 +942,10 @@ unwind:
cmd, type);
}
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
+
return 0;
}
@@ -868,27 +1004,37 @@ pl_entrylk_log_cleanup (pl_entry_lock_t *lock)
int
pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
+ posix_locks_private_t *priv;
pl_entry_lock_t *tmp = NULL;
pl_entry_lock_t *l = NULL;
- pl_dom_list_t *dom = NULL;
+ pl_dom_list_t *dom = NULL;
pl_inode_t *pinode = NULL;
-
+ struct list_head *pcontend = NULL;
struct list_head released;
struct list_head unwind;
+ struct list_head contend;
+ struct timespec now = { };
INIT_LIST_HEAD (&released);
INIT_LIST_HEAD (&unwind);
- pthread_mutex_lock (&ctx->lock);
+ priv = this->private;
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
+ pthread_mutex_lock (&ctx->lock);
{
list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,
- client_list) {
- pl_entrylk_log_cleanup (l);
+ client_list) {
+ pl_entrylk_log_cleanup (l);
- pinode = l->pinode;
+ pinode = l->pinode;
- pthread_mutex_lock (&pinode->mutex);
- {
+ pthread_mutex_lock (&pinode->mutex);
+ {
/* If the entrylk object is part of granted list but not
* blocked list, then perform the following actions:
* i. delete the object from granted list;
@@ -931,38 +1077,42 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
&unwind);
}
}
- pthread_mutex_unlock (&pinode->mutex);
+ pthread_mutex_unlock (&pinode->mutex);
}
- }
+ }
pthread_mutex_unlock (&ctx->lock);
list_for_each_entry_safe (l, tmp, &unwind, client_list) {
list_del_init (&l->client_list);
- if (l->frame)
- STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
- NULL);
+ if (l->frame)
+ STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
+ NULL);
list_add_tail (&l->client_list, &released);
}
list_for_each_entry_safe (l, tmp, &released, client_list) {
list_del_init (&l->client_list);
- pinode = l->pinode;
+ pinode = l->pinode;
- dom = get_domain (pinode, l->volume);
+ dom = get_domain (pinode, l->volume);
- grant_blocked_entry_locks (this, pinode, dom);
+ grant_blocked_entry_locks (this, pinode, dom, &now, pcontend);
- pthread_mutex_lock (&pinode->mutex);
- {
- __pl_entrylk_unref (l);
- }
- pthread_mutex_unlock (&pinode->mutex);
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ __pl_entrylk_unref (l);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
inode_unref (pinode->inode);
}
+ if (pcontend != NULL) {
+ entrylk_contention_notify(this, pcontend);
+ }
+
return 0;
}
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 64ffb00c18c..890ac8b6d00 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -13,10 +13,12 @@
#include "logging.h"
#include "common-utils.h"
#include "list.h"
+#include "upcall-utils.h"
#include "locks.h"
#include "clear.h"
#include "common.h"
+#include "pl-messages.h"
void
__delete_inode_lock (pl_inode_lock_t *lock)
@@ -229,22 +231,134 @@ out:
return revoke_lock;
}
+static gf_boolean_t
+__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ struct timespec *now)
+{
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+
+ priv = this->private;
+
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+ return _gf_false;
+ }
+
+ elapsed = now->tv_sec;
+ elapsed -= lock->contention_time.tv_sec;
+ if (now->tv_nsec < lock->contention_time.tv_nsec) {
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+ return _gf_false;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+ * region. This means that currently granted locks might have already
+ * been unlocked by that time. To avoid the lock or the inode to be
+ * destroyed before we process them, we take an additional reference
+ * on both. */
+ inode_ref(lock->pl_inode->inode);
+ __pl_inodelk_ref(lock);
+
+ lock->contention_time = *now;
+
+ return _gf_true;
+}
+
+void
+inodelk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+ struct gf_upcall up;
+ struct gf_upcall_inodelk_contention lc;
+ pl_inode_lock_t *lock;
+ pl_inode_t *pl_inode;
+ client_t *client;
+ gf_boolean_t notify;
+
+ while (!list_empty(contend)) {
+ lock = list_first_entry(contend, pl_inode_lock_t, contend);
+
+ pl_inode = lock->pl_inode;
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ /* If the lock has already been released, no notification is
+ * sent. We clear the notification time in this case. */
+ notify = !list_empty(&lock->list);
+ if (!notify) {
+ lock->contention_time.tv_sec = 0;
+ lock->contention_time.tv_nsec = 0;
+ } else {
+ memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock));
+ lc.pid = lock->client_pid;
+ lc.domain = lock->volume;
+ lc.xdata = NULL;
+
+ gf_uuid_copy(up.gfid, lock->pl_inode->gfid);
+ client = (client_t *)lock->client;
+ if (client == NULL) {
+ /* A NULL client can be found if the inodelk
+ * was issued by a server side xlator. */
+ up.client_uid = NULL;
+ } else {
+ up.client_uid = client->client_uid;
+ }
+ }
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ if (notify) {
+ up.event_type = GF_UPCALL_INODELK_CONTENTION;
+ up.data = &lc;
+
+ if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+ gf_msg_debug(this->name, 0,
+ "Inodelk contention notification "
+ "failed");
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Inodelk contention notification "
+ "sent");
+ }
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+
+ list_del_init(&lock->contend);
+ __pl_inodelk_unref(lock);
+
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+ inode_unref(pl_inode->inode);
+ }
+}
+
/* Determine if lock is grantable or not */
static pl_inode_lock_t *
-__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)
+__inodelk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ struct timespec *now, struct list_head *contend)
{
pl_inode_lock_t *l = NULL;
pl_inode_lock_t *ret = NULL;
- if (list_empty (&dom->inodelk_list))
- goto out;
+
list_for_each_entry (l, &dom->inodelk_list, list){
if (inodelk_conflict (lock, l) &&
!same_inodelk_owner (lock, l)) {
- ret = l;
- goto out;
+ if (ret == NULL) {
+ ret = l;
+ if (contend == NULL) {
+ break;
+ }
+ }
+ if (__inodelk_needs_contention_notify(this, l, now)) {
+ list_add_tail(&l->contend, contend);
+ }
}
}
-out:
+
return ret;
}
@@ -252,20 +366,14 @@ static pl_inode_lock_t *
__blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock)
{
pl_inode_lock_t *l = NULL;
- pl_inode_lock_t *ret = NULL;
-
- if (list_empty (&dom->blocked_inodelks))
- return NULL;
list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) {
if (inodelk_conflict (lock, l)) {
- ret = l;
- goto out;
+ return l;
}
}
-out:
- return ret;
+ return NULL;
}
static int
@@ -286,35 +394,45 @@ __owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock)
return 0;
}
+static int
+__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ int can_block)
+{
+ struct timeval now;
+
+ gettimeofday(&now, NULL);
+
+ if (can_block == 0) {
+ goto out;
+ }
+
+ lock->blkd_time = now;
+ list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
+
+ gf_msg_trace (this->name, 0, "%s (pid=%d) (lk-owner=%s) %"PRId64" - "
+ "%"PRId64" => Blocked",
+ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+ lock->client_pid, lkowner_utoa (&lock->owner),
+ lock->user_flock.l_start, lock->user_flock.l_len);
+
+out:
+ return -EAGAIN;
+}
/* Determines if lock can be granted and adds the lock. If the lock
* is blocking, adds it to the blocked_inodelks list of the domain.
*/
static int
__lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
- int can_block, pl_dom_list_t *dom)
+ int can_block, pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
pl_inode_lock_t *conf = NULL;
int ret = -EINVAL;
- conf = __inodelk_grantable (dom, lock);
+ conf = __inodelk_grantable (this, dom, lock, now, contend);
if (conf) {
- ret = -EAGAIN;
- if (can_block == 0)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked",
- lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
- lock->client_pid,
- lkowner_utoa (&lock->owner),
- lock->user_flock.l_start,
- lock->user_flock.l_len);
-
-
+ ret = __lock_blocked_add(this, dom, lock, can_block);
goto out;
}
@@ -330,25 +448,15 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
* will not be unlocked by SHD from Machine1.
* TODO: Find why 'owner_has_lock' is checked even for blocked locks.
*/
- if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
- ret = -EAGAIN;
- if (can_block == 0)
- goto out;
-
- gettimeofday (&lock->blkd_time, NULL);
- list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Lock is grantable, but blocking to prevent starvation");
- gf_log (this->name, GF_LOG_TRACE,
- "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked",
- lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
- lock->client_pid,
- lkowner_utoa (&lock->owner),
- lock->user_flock.l_start,
- lock->user_flock.l_len);
-
+ if (__blocked_lock_conflict (dom, lock) &&
+ !(__owner_has_lock (dom, lock))) {
+ if (can_block != 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock is grantable, but blocking to prevent "
+ "starvation");
+ }
+ ret = __lock_blocked_add(this, dom, lock, can_block);
goto out;
}
__pl_inodelk_ref (lock);
@@ -417,7 +525,8 @@ out:
static void
__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- struct list_head *granted, pl_dom_list_t *dom)
+ struct list_head *granted, pl_dom_list_t *dom,
+ struct timespec *now, struct list_head *contend)
{
int bl_ret = 0;
pl_inode_lock_t *bl = NULL;
@@ -432,7 +541,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
list_del_init (&bl->blocked_locks);
- bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom);
+ bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom, now,
+ contend);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
@@ -444,7 +554,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grant all inodelks blocked on a lock */
void
grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_dom_list_t *dom)
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend)
{
struct list_head granted;
pl_inode_lock_t *lock;
@@ -454,7 +565,8 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
- __grant_blocked_inode_locks (this, pl_inode, &granted, dom);
+ __grant_blocked_inode_locks (this, pl_inode, &granted, dom,
+ now, contend);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -471,7 +583,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
&lock->user_flock, 0, 0, lock->volume);
STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL);
- lock->frame = NULL;
+ lock->frame = NULL;
}
pthread_mutex_lock (&pl_inode->mutex);
@@ -488,9 +600,9 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
static void
pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
{
- pl_inode_t *pl_inode = NULL;
+ pl_inode_t *pl_inode = NULL;
- pl_inode = lock->pl_inode;
+ pl_inode = lock->pl_inode;
gf_log (THIS->name, GF_LOG_WARNING, "releasing lock on %s held by "
"{client=%p, pid=%"PRId64" lk-owner=%s}",
@@ -503,27 +615,38 @@ pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
int
pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
+ posix_locks_private_t *priv;
pl_inode_lock_t *tmp = NULL;
pl_inode_lock_t *l = NULL;
- pl_dom_list_t *dom = NULL;
+ pl_dom_list_t *dom = NULL;
pl_inode_t *pl_inode = NULL;
-
+ struct list_head *pcontend = NULL;
struct list_head released;
struct list_head unwind;
+ struct list_head contend;
+ struct timespec now = { };
+
+ priv = this->private;
INIT_LIST_HEAD (&released);
INIT_LIST_HEAD (&unwind);
- pthread_mutex_lock (&ctx->lock);
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD (pcontend);
+ timespec_now(&now);
+ }
+
+ pthread_mutex_lock (&ctx->lock);
{
list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
- client_list) {
- pl_inodelk_log_cleanup (l);
+ client_list) {
+ pl_inodelk_log_cleanup (l);
- pl_inode = l->pl_inode;
+ pl_inode = l->pl_inode;
- pthread_mutex_lock (&pl_inode->mutex);
- {
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
/* If the inodelk object is part of granted list but not
* blocked list, then perform the following actions:
* i. delete the object from granted list;
@@ -567,45 +690,49 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
&unwind);
}
}
- pthread_mutex_unlock (&pl_inode->mutex);
+ pthread_mutex_unlock (&pl_inode->mutex);
}
- }
+ }
pthread_mutex_unlock (&ctx->lock);
list_for_each_entry_safe (l, tmp, &unwind, client_list) {
list_del_init (&l->client_list);
if (l->frame)
- STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
- NULL);
+ STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
+ NULL);
list_add_tail (&l->client_list, &released);
-
}
list_for_each_entry_safe (l, tmp, &released, client_list) {
list_del_init (&l->client_list);
- pl_inode = l->pl_inode;
+ pl_inode = l->pl_inode;
- dom = get_domain (pl_inode, l->volume);
+ dom = get_domain (pl_inode, l->volume);
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now,
+ pcontend);
- pthread_mutex_lock (&pl_inode->mutex);
- {
- __pl_inodelk_unref (l);
- }
- pthread_mutex_unlock (&pl_inode->mutex);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ __pl_inodelk_unref (l);
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
inode_unref (pl_inode->inode);
}
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
+
return 0;
}
static int
pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
- pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
+ pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
inode_t *inode)
{
posix_locks_private_t *priv = NULL;
@@ -613,9 +740,12 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
gf_boolean_t need_inode_unref = _gf_false;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
+ struct timespec now = { };
short fl_type;
- lock->pl_inode = pl_inode;
+ lock->pl_inode = pl_inode;
fl_type = lock->fl_type;
priv = this->private;
@@ -657,12 +787,19 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
}
}
+ if (priv->notify_contention) {
+ pcontend = &contend;
+ INIT_LIST_HEAD(pcontend);
+ timespec_now(&now);
+ }
+
if (ctx)
pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
{
if (lock->fl_type != F_UNLCK) {
- ret = __lock_inodelk (this, pl_inode, lock, can_block, dom);
+ ret = __lock_inodelk (this, pl_inode, lock, can_block,
+ dom, &now, pcontend);
if (ret == 0) {
lock->frame = NULL;
gf_log (this->name, GF_LOG_TRACE,
@@ -725,13 +862,18 @@ out:
*/
if ((fl_type == F_UNLCK) && (ret == 0)) {
inode_unref (pl_inode->inode);
- grant_blocked_inode_locks (this, pl_inode, dom);
+ grant_blocked_inode_locks (this, pl_inode, dom, &now,
+ pcontend);
}
if (need_inode_unref) {
inode_unref (pl_inode->inode);
}
+ if (pcontend != NULL) {
+ inodelk_contention_notify(this, pcontend);
+ }
+
return ret;
}
@@ -771,7 +913,8 @@ new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
INIT_LIST_HEAD (&lock->list);
INIT_LIST_HEAD (&lock->blocked_locks);
- INIT_LIST_HEAD (&lock->client_list);
+ INIT_LIST_HEAD (&lock->client_list);
+ INIT_LIST_HEAD (&lock->contend);
__pl_inodelk_ref (lock);
return lock;
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index 3d3b327f56c..c2edfff8f00 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -70,6 +70,7 @@ typedef struct __posix_lock posix_lock_t;
struct __pl_inode_lock {
struct list_head list;
struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */
+ struct list_head contend; /* list of contending locks */
int ref;
short fl_type;
@@ -86,6 +87,8 @@ struct __pl_inode_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
+ /*last time at wich lock contention was detected and notified*/
+ struct timespec contention_time;
/* These two together serve to uniquely identify each process
across nodes */
@@ -120,6 +123,7 @@ typedef struct _pl_dom_list pl_dom_list_t;
struct __entry_lock {
struct list_head domain_list; /* list_head back to pl_dom_list_t */
struct list_head blocked_locks; /* list_head back to blocked_entrylks */
+ struct list_head contend; /* list of contending locks */
int ref;
call_frame_t *frame;
@@ -133,6 +137,8 @@ struct __entry_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
+ /*last time at wich lock contention was detected and notified*/
+ struct timespec contention_time;
void *client;
gf_lkowner_t owner;
@@ -194,6 +200,8 @@ typedef struct {
uint32_t revocation_secs;
gf_boolean_t revocation_clear_all;
uint32_t revocation_max_blocked;
+ gf_boolean_t notify_contention;
+ uint32_t notify_contention_delay;
} posix_locks_private_t;
diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h
index 7a1e3f488e7..e5a276f35b5 100644
--- a/xlators/features/locks/src/pl-messages.h
+++ b/xlators/features/locks/src/pl-messages.h
@@ -24,7 +24,9 @@
*/
GLFS_MSGID(PL,
- PL_MSG_LOCK_NUMBER
+ PL_MSG_LOCK_NUMBER,
+ PL_MSG_INODELK_CONTENTION_FAILED,
+ PL_MSG_ENTRYLK_CONTENTION_FAILED
);
#endif /* !_PL_MESSAGES_H_ */
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 78bf160058c..82d77db1164 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -3641,6 +3641,13 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("revocation-max-blocked",
priv->revocation_max_blocked, options,
uint32, out);
+
+ GF_OPTION_RECONF ("notify-contention", priv->notify_contention,
+ options, bool, out);
+
+ GF_OPTION_RECONF ("notify-contention-delay",
+ priv->notify_contention_delay, options, uint32, out);
+
ret = 0;
out:
@@ -3705,6 +3712,12 @@ init (xlator_t *this)
GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked,
uint32, out);
+ GF_OPTION_INIT ("notify-contention", priv->notify_contention, bool,
+ out);
+
+ GF_OPTION_INIT ("notify-contention-delay",
+ priv->notify_contention_delay, uint32, out);
+
this->local_pool = mem_pool_new (pl_local_t, 32);
if (!this->local_pool) {
ret = -1;
@@ -4461,5 +4474,32 @@ struct volume_options options[] = {
"will be revoked to allow the others to proceed. Can "
"be used in conjunction w/ revocation-clear-all."
},
+ { .key = {"notify-contention"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .tags = { "locks", "contention" },
+ .description = "When this option is enabled and a lock request "
+ "conflicts with a currently granted lock, an upcall "
+ "notification will be sent to the current owner of "
+ "the lock to request it to be released as soon as "
+ "possible."
+ },
+ { .key = {"notify-contention-delay"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /* An upcall notification is sent every time a conflict is
+ * detected. */
+ .max = 60,
+ .default_value = "5",
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .op_version = { GD_OP_VERSION_4_0_0 },
+ .tags = { "locks", "contention", "timeout" },
+ .description = "This value determines the minimum amount of time "
+ "(in seconds) between upcall contention notifications "
+ "on the same inode. If multiple lock requests are "
+ "received during this period, only one upcall will "
+ "be sent."
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index da85cb5297f..e0ec3368ca7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1484,6 +1484,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_13_0,
.flags = VOLOPT_FLAG_CLIENT_OPT
},
+ { .key = "disperse.eager-lock-timeout",
+ .voltype = "cluster/disperse",
+ .op_version = GD_OP_VERSION_4_0_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
+ { .key = "disperse.other-eager-lock-timeout",
+ .voltype = "cluster/disperse",
+ .op_version = GD_OP_VERSION_4_0_0,
+ .flags = VOLOPT_FLAG_CLIENT_OPT
+ },
{ .key = "cluster.quorum-type",
.voltype = "cluster/replicate",
.option = "quorum-type",
@@ -3489,6 +3499,16 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_9_0,
.type = NO_DOC,
},
+ { .option = "notify-contention",
+ .key = "features.locks-notify-contention",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_4_0_0,
+ },
+ { .option = "notify-contention-delay",
+ .key = "features.locks-notify-contention-delay",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_4_0_0,
+ },
{ .key = "disperse.shd-max-threads",
.voltype = "cluster/disperse",
.op_version = GD_OP_VERSION_3_9_0,
diff --git a/xlators/protocol/client/src/client-callback.c b/xlators/protocol/client/src/client-callback.c
index 51164e57230..b2f9a225887 100644
--- a/xlators/protocol/client/src/client-callback.c
+++ b/xlators/protocol/client/src/client-callback.c
@@ -173,6 +173,101 @@ out:
return 0;
}
+int
+client_cbk_inodelk_contention (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ int ret = -1;
+ struct iovec *iov = NULL;
+ struct gf_upcall upcall_data = {0,};
+ struct gf_upcall_inodelk_contention lc = {{0,},};
+ gfs4_inodelk_contention_req proto_lc = {{0,},};
+
+ GF_VALIDATE_OR_GOTO ("client-callback", rpc, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", mydata, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", data, out);
+
+ iov = (struct iovec *)data;
+ ret = xdr_to_generic (*iov, &proto_lc,
+ (xdrproc_t)xdr_gfs4_inodelk_contention_req);
+
+ if (ret < 0) {
+ gf_msg (THIS->name, GF_LOG_WARNING, -ret,
+ PC_MSG_INODELK_CONTENTION_FAIL,
+ "XDR decode of inodelk contention failed.");
+ goto out;
+ }
+
+ upcall_data.data = &lc;
+ ret = gf_proto_inodelk_contention_to_upcall (&proto_lc, &upcall_data);
+ if (ret < 0)
+ goto out;
+
+ upcall_data.event_type = GF_UPCALL_INODELK_CONTENTION;
+
+ default_notify (THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out:
+ if (proto_lc.domain)
+ free (proto_lc.domain);
+
+ if (proto_lc.xdata.xdata_val)
+ free (proto_lc.xdata.xdata_val);
+
+ if (lc.xdata)
+ dict_unref (lc.xdata);
+
+ return ret;
+}
+
+int
+client_cbk_entrylk_contention (struct rpc_clnt *rpc, void *mydata, void *data)
+{
+ int ret = -1;
+ struct iovec *iov = NULL;
+ struct gf_upcall upcall_data = {0,};
+ struct gf_upcall_entrylk_contention lc = {0,};
+ gfs4_entrylk_contention_req proto_lc = {{0,},};
+
+ GF_VALIDATE_OR_GOTO ("client-callback", rpc, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", mydata, out);
+ GF_VALIDATE_OR_GOTO ("client-callback", data, out);
+
+ iov = (struct iovec *)data;
+ ret = xdr_to_generic (*iov, &proto_lc,
+ (xdrproc_t)xdr_gfs4_entrylk_contention_req);
+
+ if (ret < 0) {
+ gf_msg (THIS->name, GF_LOG_WARNING, -ret,
+ PC_MSG_ENTRYLK_CONTENTION_FAIL,
+ "XDR decode of entrylk contention failed.");
+ goto out;
+ }
+
+ upcall_data.data = &lc;
+ ret = gf_proto_entrylk_contention_to_upcall (&proto_lc, &upcall_data);
+ if (ret < 0)
+ goto out;
+
+ upcall_data.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+
+ default_notify (THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out:
+ if (proto_lc.name)
+ free (proto_lc.name);
+
+ if (proto_lc.domain)
+ free (proto_lc.domain);
+
+ if (proto_lc.xdata.xdata_val)
+ free (proto_lc.xdata.xdata_val);
+
+ if (lc.xdata)
+ dict_unref (lc.xdata);
+
+ return ret;
+}
+
rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {
[GF_CBK_NULL] = {"NULL", GF_CBK_NULL, client_cbk_null },
[GF_CBK_FETCHSPEC] = {"FETCHSPEC", GF_CBK_FETCHSPEC, client_cbk_fetchspec },
@@ -181,6 +276,8 @@ rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {
[GF_CBK_CHILD_UP] = {"CHILD_UP", GF_CBK_CHILD_UP, client_cbk_child_up },
[GF_CBK_CHILD_DOWN] = {"CHILD_DOWN", GF_CBK_CHILD_DOWN, client_cbk_child_down },
[GF_CBK_RECALL_LEASE] = {"RECALL_LEASE", GF_CBK_RECALL_LEASE, client_cbk_recall_lease },
+ [GF_CBK_INODELK_CONTENTION] = {"INODELK_CONTENTION", GF_CBK_INODELK_CONTENTION, client_cbk_inodelk_contention },
+ [GF_CBK_ENTRYLK_CONTENTION] = {"ENTRYLK_CONTENTION", GF_CBK_ENTRYLK_CONTENTION, client_cbk_entrylk_contention },
};
diff --git a/xlators/protocol/client/src/client-messages.h b/xlators/protocol/client/src/client-messages.h
index 86b721bd593..5f146c67efe 100644
--- a/xlators/protocol/client/src/client-messages.h
+++ b/xlators/protocol/client/src/client-messages.h
@@ -89,7 +89,9 @@ GLFS_MSGID(PC,
PC_MSG_CACHE_INVALIDATION_FAIL,
PC_MSG_CHILD_STATUS,
PC_MSG_GFID_NULL,
- PC_MSG_RECALL_LEASE_FAIL
+ PC_MSG_RECALL_LEASE_FAIL,
+ PC_MSG_INODELK_CONTENTION_FAIL,
+ PC_MSG_ENTRYLK_CONTENTION_FAIL
);
#endif /* !_PC_MESSAGES_H__ */
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index 7154355e690..66122318c79 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -1349,6 +1349,8 @@ server_process_event_upcall (xlator_t *this, void *data)
enum gf_cbk_procnum cbk_procnum = GF_CBK_NULL;
gfs3_cbk_cache_invalidation_req gf_c_req = {0,};
gfs3_recall_lease_req gf_recall_lease = {{0,},};
+ gfs4_inodelk_contention_req gf_inodelk_contention = {{0},};
+ gfs4_entrylk_contention_req gf_entrylk_contention = {{0},};
xdrproc_t xdrproc;
GF_VALIDATE_OR_GOTO(this->name, data, out);
@@ -1358,7 +1360,16 @@ server_process_event_upcall (xlator_t *this, void *data)
upcall_data = (struct gf_upcall *)data;
client_uid = upcall_data->client_uid;
- GF_VALIDATE_OR_GOTO(this->name, client_uid, out);
+ /* client_uid could be NULL if the upcall was intended for a server's
+ * child xlator (so no client_uid available) but it hasn't handled
+ * the notification. For this reason we silently ignore any upcall
+ * request with a NULL client_uid, but -1 will be returned.
+ */
+ if (client_uid == NULL) {
+ gf_msg_debug(this->name, 0,
+ "NULL client_uid for an upcall request");
+ goto out;
+ }
switch (upcall_data->event_type) {
case GF_UPCALL_CACHE_INVALIDATION:
@@ -1381,6 +1392,28 @@ server_process_event_upcall (xlator_t *this, void *data)
cbk_procnum = GF_CBK_RECALL_LEASE;
xdrproc = (xdrproc_t)xdr_gfs3_recall_lease_req;
break;
+ case GF_UPCALL_INODELK_CONTENTION:
+ ret = gf_proto_inodelk_contention_from_upcall (this,
+ &gf_inodelk_contention,
+ upcall_data);
+ if (ret < 0)
+ goto out;
+
+ up_req = &gf_inodelk_contention;
+ cbk_procnum = GF_CBK_INODELK_CONTENTION;
+ xdrproc = (xdrproc_t)xdr_gfs4_inodelk_contention_req;
+ break;
+ case GF_UPCALL_ENTRYLK_CONTENTION:
+ ret = gf_proto_entrylk_contention_from_upcall (this,
+ &gf_entrylk_contention,
+ upcall_data);
+ if (ret < 0)
+ goto out;
+
+ up_req = &gf_entrylk_contention;
+ cbk_procnum = GF_CBK_ENTRYLK_CONTENTION;
+ xdrproc = (xdrproc_t)xdr_gfs4_entrylk_contention_req;
+ break;
default:
gf_msg (this->name, GF_LOG_WARNING, EINVAL,
PS_MSG_INVALID_ENTRY,
@@ -1417,11 +1450,10 @@ server_process_event_upcall (xlator_t *this, void *data)
pthread_mutex_unlock (&conf->mutex);
ret = 0;
out:
- if ((gf_c_req.xdata).xdata_val)
- GF_FREE ((gf_c_req.xdata).xdata_val);
-
- if ((gf_recall_lease.xdata).xdata_val)
- GF_FREE ((gf_recall_lease.xdata).xdata_val);
+ GF_FREE ((gf_c_req.xdata).xdata_val);
+ GF_FREE ((gf_recall_lease.xdata).xdata_val);
+ GF_FREE ((gf_inodelk_contention.xdata).xdata_val);
+ GF_FREE ((gf_entrylk_contention.xdata).xdata_val);
return ret;
}