diff options
22 files changed, 1203 insertions, 264 deletions
diff --git a/libglusterfs/src/upcall-utils.h b/libglusterfs/src/upcall-utils.h index 3b5dce33e45..48d10382c10 100644 --- a/libglusterfs/src/upcall-utils.h +++ b/libglusterfs/src/upcall-utils.h @@ -63,6 +63,8 @@ typedef enum { GF_UPCALL_EVENT_NULL, GF_UPCALL_CACHE_INVALIDATION, GF_UPCALL_RECALL_LEASE, + GF_UPCALL_INODELK_CONTENTION, + GF_UPCALL_ENTRYLK_CONTENTION, } gf_upcall_event_t; struct gf_upcall { @@ -88,4 +90,19 @@ struct gf_upcall_recall_lease { dict_t *dict; }; +struct gf_upcall_inodelk_contention { + struct gf_flock flock; + pid_t pid; + const char *domain; + dict_t *xdata; +}; + +struct gf_upcall_entrylk_contention { + uint32_t type; + pid_t pid; + const char *name; + const char *domain; + dict_t *xdata; +}; + #endif /* _UPCALL_UTILS_H */ diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index aee34302205..aafd94400c6 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -150,6 +150,8 @@ enum gf_cbk_procnum { GF_CBK_CHILD_DOWN, GF_CBK_RECALL_LEASE, GF_CBK_STATEDUMP, + GF_CBK_INODELK_CONTENTION, + GF_CBK_ENTRYLK_CONTENTION, GF_CBK_MAXVALUE, }; diff --git a/rpc/xdr/src/glusterfs3.h b/rpc/xdr/src/glusterfs3.h index 2da5594a347..eef39416b5c 100644 --- a/rpc/xdr/src/glusterfs3.h +++ b/rpc/xdr/src/glusterfs3.h @@ -419,6 +419,162 @@ gf_proto_cache_invalidation_to_upcall (xlator_t *this, return ret; } +static inline int +gf_proto_inodelk_contention_to_upcall (struct gfs4_inodelk_contention_req *lc, + struct gf_upcall *gf_up_data) +{ + struct gf_upcall_inodelk_contention *tmp = NULL; + xlator_t *this = NULL; + int ret = -1; + int op_errno = EINVAL; + + this = THIS; + + GF_VALIDATE_OR_GOTO(this->name, lc, out); + GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + + tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data; + + gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid); + + gf_proto_flock_to_flock(&lc->flock, &tmp->flock); + tmp->pid = lc->pid; + tmp->domain = lc->domain; + if ((tmp->domain != NULL) && (*tmp->domain == 0)) { + tmp->domain = NULL; + } + + GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val, + lc->xdata.xdata_len, ret, op_errno, out); + + ret = 0; + +out: + if (ret < 0) { + ret = -op_errno; + } + + return ret; +} + +static inline int +gf_proto_inodelk_contention_from_upcall (xlator_t *this, + struct gfs4_inodelk_contention_req *lc, + struct gf_upcall *gf_up_data) +{ + struct gf_upcall_inodelk_contention *tmp = NULL; + int ret = -1; + int op_errno = EINVAL; + + GF_VALIDATE_OR_GOTO(this->name, lc, out); + GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + + tmp = (struct gf_upcall_inodelk_contention *)gf_up_data->data; + + gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid); + + gf_proto_flock_from_flock(&lc->flock, &tmp->flock); + lc->pid = tmp->pid; + lc->domain = (char *)tmp->domain; + if (lc->domain == NULL) { + lc->domain = ""; + } + + GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val, + lc->xdata.xdata_len, op_errno, out); + + ret = 0; + +out: + if (ret < 0) { + ret = -op_errno; + } + + return ret; +} + +static inline int +gf_proto_entrylk_contention_to_upcall (struct gfs4_entrylk_contention_req *lc, + struct gf_upcall *gf_up_data) +{ + struct gf_upcall_entrylk_contention *tmp = NULL; + xlator_t *this = NULL; + int ret = -1; + int op_errno = EINVAL; + + this = THIS; + + GF_VALIDATE_OR_GOTO(this->name, lc, out); + GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + + tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data; + + gf_uuid_copy(gf_up_data->gfid, (unsigned char *)lc->gfid); + + tmp->type = lc->type; + tmp->name = lc->name; + if ((tmp->name != NULL) && (*tmp->name == 0)) { + tmp->name = NULL; + } + tmp->pid = lc->pid; + tmp->domain = lc->domain; + if ((tmp->domain != NULL) && (*tmp->domain == 0)) { + tmp->domain = NULL; + } + + GF_PROTOCOL_DICT_UNSERIALIZE (this, tmp->xdata, lc->xdata.xdata_val, + lc->xdata.xdata_len, ret, op_errno, out); + + ret = 0; + +out: + if (ret < 0) { + ret = -op_errno; + } + + return ret; +} + +static inline int +gf_proto_entrylk_contention_from_upcall (xlator_t *this, + struct gfs4_entrylk_contention_req *lc, + struct gf_upcall *gf_up_data) +{ + struct gf_upcall_entrylk_contention *tmp = NULL; + int ret = -1; + int op_errno = EINVAL; + + GF_VALIDATE_OR_GOTO(this->name, lc, out); + GF_VALIDATE_OR_GOTO(this->name, gf_up_data, out); + + tmp = (struct gf_upcall_entrylk_contention *)gf_up_data->data; + + gf_uuid_copy((unsigned char *)lc->gfid, gf_up_data->gfid); + + lc->type = tmp->type; + lc->name = (char *)tmp->name; + if (lc->name == NULL) { + lc->name = ""; + } + lc->pid = tmp->pid; + lc->domain = (char *)tmp->domain; + if (lc->domain == NULL) { + lc->domain = ""; + } + + GF_PROTOCOL_DICT_SERIALIZE (this, tmp->xdata, &lc->xdata.xdata_val, + lc->xdata.xdata_len, op_errno, out); + + ret = 0; + +out: + if (ret < 0) { + ret = -op_errno; + } + + return ret; +} + extern int dict_to_xdr (dict_t *this, gfx_dict *xdict); extern int xdr_to_dict (gfx_dict *xdict, dict_t **to); diff --git a/rpc/xdr/src/glusterfs4-xdr.x b/rpc/xdr/src/glusterfs4-xdr.x index 7396b566fa7..ef0cfde0802 100644 --- a/rpc/xdr/src/glusterfs4-xdr.x +++ b/rpc/xdr/src/glusterfs4-xdr.x @@ -86,3 +86,20 @@ struct gfs4_namelink_req { string bname<>; opaque xdata<>; }; + +struct gfs4_inodelk_contention_req { + opaque gfid[16]; + struct gf_proto_flock flock; + unsigned int pid; + string domain<>; + opaque xdata<>; +}; + +struct gfs4_entrylk_contention_req { + opaque gfid[16]; + unsigned int type; + unsigned int pid; + string name<>; + string domain<>; + opaque xdata<>; +}; diff --git a/rpc/xdr/src/libgfxdr.sym b/rpc/xdr/src/libgfxdr.sym index 8af956ef5a9..83f1efc732a 100644 --- a/rpc/xdr/src/libgfxdr.sym +++ b/rpc/xdr/src/libgfxdr.sym @@ -155,8 +155,12 @@ xdr_gfs3_xattrop_req xdr_gfs3_xattrop_rsp xdr_gfs3_zerofill_req xdr_gfs3_zerofill_rsp +xdr_gfs4_entrylk_contention_req +xdr_gfs4_entrylk_contention_rsp xdr_gfs4_icreate_req xdr_gfs4_icreate_rsp +xdr_gfs4_inodelk_contention_req +xdr_gfs4_inodelk_contention_rsp xdr_gfs4_namelink_req xdr_gfs4_namelink_rsp xdr_gf_set_lk_ver_req diff --git a/tests/basic/ec/lock-contention.t b/tests/basic/ec/lock-contention.t new file mode 100644 index 00000000000..8f86cee16ad --- /dev/null +++ b/tests/basic/ec/lock-contention.t @@ -0,0 +1,62 @@ +#!/bin/bash + +# This test verifies that when 'lock-notify-contention' option is enabled, +# locks xlator actually sends an upcall notification that causes the acquired +# lock from one client to be released before it's supposed to when another +# client accesses the file. + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function elapsed_time() { + local start="`date +%s`" + + if [[ "test" == `cat "$1"` ]]; then + echo "$((`date +%s` - ${start}))" + fi +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 features.locks-notify-contention off +TEST $CLI volume set $V0 disperse.eager-lock on +TEST $CLI volume set $V0 disperse.eager-lock-timeout 6 +TEST $CLI volume set $V0 disperse.other-eager-lock on +TEST $CLI volume set $V0 disperse.other-eager-lock-timeout 6 +TEST $CLI volume start $V0 + +TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M0 + +TEST $GFS --direct-io-mode=yes --volfile-id=/$V0 --volfile-server=$H0 $M1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 $M1 + +TEST $(echo "test" >$M0/file) + +# With locks-notify-contention set to off, accessing the file from another +# client should take 6 seconds. Checking against 3 seconds to be safe. +elapsed="$(elapsed_time $M1/file)" +TEST [[ ${elapsed} -ge 3 ]] + +elapsed="$(elapsed_time $M0/file)" +TEST [[ ${elapsed} -ge 3 ]] + +TEST $CLI volume set $V0 features.locks-notify-contention on + +# With locks-notify-contention set to on, accessing the file from another +# client should be fast. Checking against 3 seconds to be safe. +elapsed="$(elapsed_time $M1/file)" +TEST [[ ${elapsed} -le 3 ]] + +elapsed="$(elapsed_time $M0/file)" +TEST [[ ${elapsed} -le 3 ]] + +cleanup diff --git a/tests/volume.rc b/tests/volume.rc index 3a48b43d2e0..3ee83624058 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -93,7 +93,8 @@ function remove_brick_status_completed_field { function get_mount_process_pid { local vol=$1 - ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1 + local mnt=$2 + ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1 } function get_nfs_pid () @@ -126,7 +127,8 @@ function generate_statedump { function generate_mount_statedump { local vol=$1 - generate_statedump $(get_mount_process_pid $vol) + local mnt=$2 + generate_statedump $(get_mount_process_pid $vol $mnt) } function cleanup_mount_statedump { @@ -205,14 +207,16 @@ function ec_child_up_status { local vol=$1 local dist_id=$2 local brick_id=$(($3 + 1)) - local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol)) + local mnt=$4 + local mask=$(ec_get_info $vol $dist_id "childs_up_mask" $(generate_mount_statedump $vol $mnt)) echo "${mask: -$brick_id:1}" } function ec_child_up_count { local vol=$1 local dist_id=$2 - ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol) + local mnt=$3 + ec_get_info $vol $dist_id "childs_up" $(generate_mount_statedump $vol $mnt) } function ec_child_up_status_shd { diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index cb627a92c9c..fbc7ac97aa0 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) return _gf_true; } +static ec_lock_link_t * +ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock) +{ + ec_lock_link_t *timer_link; + + /* If we don't have any timer, there's nothing to cancel. */ + if (lock->timer == NULL) { + return NULL; + } + + /* We are trying to access a lock that has an unlock timer active. + * This means that the lock must be idle, i.e. no fop can be in the + * owner, waiting or frozen lists. It also means that the lock cannot + * have been marked as being released (this is done without timers). + * There should only be one owner reference, but it's possible that + * some fops are being prepared to use this lock. */ + GF_ASSERT ((lock->refs_owners == 1) && + list_empty(&lock->owners) && list_empty(&lock->waiting)); + + /* We take the timer_link before cancelling the timer, since a + * successful cancellation will destroy it. It must not be NULL + * because it references the fop responsible for the delayed unlock + * that we are currently trying to cancel. */ + timer_link = lock->timer->data; + GF_ASSERT(timer_link != NULL); + + if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) { + /* It's too late to avoid the execution of the timer callback. + * Since we need to be sure that the callback has access to all + * needed resources, we cannot resume the execution of the + * timer fop now. This will be done in the callback. */ + timer_link = NULL; + } else { + /* The timer has been cancelled. The fop referenced by + * timer_link holds the last reference. The caller is + * responsible to release it when not needed anymore. */ + ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + } + + /* We have two options here: + * + * 1. The timer has been successfully cancelled. + * + * This is the easiest case and we can continue with the currently + * acquired lock. + * + * 2. The timer callback has already been fired. + * + * In this case we have not been able to cancel the timer before + * the timer callback has been fired, but we also know that + * lock->timer != NULL. This means that the timer callback is still + * trying to acquire the inode mutex that we currently own. We are + * safe until we release it. In this case we can safely clear + * lock->timer. This will cause that the timer callback does nothing + * once it acquires the mutex. + */ + lock->timer = NULL; + + return timer_link; +} + static gf_boolean_t ec_lock_assign_owner(ec_lock_link_t *link) { @@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link) * empty. */ GF_ASSERT(list_empty(&lock->frozen)); - if (lock->timer != NULL) { - /* We are trying to acquire a lock that has an unlock timer active. - * This means that the lock must be idle, i.e. no fop can be in the - * owner, waiting or frozen lists. It also means that the lock cannot - * have been marked as being released (this is done without timers). - * There should only be one owner reference, but it's possible that - * some fops are being prepared to use this lock. - */ - GF_ASSERT ((lock->refs_owners == 1) && - list_empty(&lock->owners) && list_empty(&lock->waiting)); - - /* We take the timer_link before cancelling the timer, since a - * successful cancellation will destroy it. It must not be NULL - * because it references the fop responsible for the delayed unlock - * that we are currently trying to cancel. */ - timer_link = lock->timer->data; - GF_ASSERT(timer_link != NULL); - - if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) { - /* It's too late to avoid the execution of the timer callback. - * Since we need to be sure that the callback has access to all - * needed resources, we cannot resume the execution of the timer - * fop now. This will be done in the callback. - */ - timer_link = NULL; - } else { - /* The timer has been cancelled, so we need to release the owner - * reference that was held by the fop waiting for the timer. This - * can be the last reference, but we'll immediately increment it - * for the current fop, so no need to check it. - */ - lock->refs_owners--; - - ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); - } - - /* We have two options here: - * - * 1. The timer has been successfully cancelled. - * - * This is the easiest case and we can continue with the currently - * acquired lock. - * - * 2. The timer callback has already been fired. - * - * In this case we have not been able to cancel the timer before - * the timer callback has been fired, but we also know that - * lock->timer != NULL. This means that the timer callback is still - * trying to acquire the inode mutex that we currently own. We are - * safe until we release it. In this case we can safely clear - * lock->timer. This will cause that the timer callback does nothing - * once it acquires the mutex. - */ - lock->timer = NULL; - } + timer_link = ec_lock_timer_cancel(fop->xl, lock); if (!list_empty(&lock->owners)) { /* There are other owners of this lock. We can only take ownership if @@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link) } list_add_tail(&link->owner_list, &lock->owners); - lock->refs_owners++; + + /* If timer_link is not NULL, it means that we have inherited the owner + * reference assigned to the timer fop. In this case we simply reuse it. + * Otherwise we need to increase the number of owners. */ + if (timer_link == NULL) { + lock->refs_owners++; + } assigned = _gf_true; @@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link) ec_resume(link->fop, 0); } +void +ec_lock_release(ec_t *ec, inode_t *inode) +{ + ec_lock_t *lock; + ec_inode_t *ctx; + ec_lock_link_t *timer_link = NULL; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, ec->xl); + if (ctx == NULL) { + goto done; + } + lock = ctx->inode_lock; + if ((lock == NULL) || !lock->acquired || lock->release) { + goto done; + } + + gf_msg_debug(ec->xl->name, 0, + "Releasing inode %p due to lock contention", inode); + + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(ec->xl, lock); + + /* We mark the lock to be released as soon as possible. */ + lock->release = _gf_true; + +done: + UNLOCK(&inode->lock); + + /* If we have cancelled the timer, we need to start the unlock of the + * inode. If there was a timer but we have been unable to cancel it + * because it was just triggered, the timer callback will take care + * of releasing the inode. */ + if (timer_link != NULL) { + ec_unlock_now(timer_link); + } +} + void ec_unlock_timer_add(ec_lock_link_t *link); void @@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data) ec_unlock_timer_del(data); } +static gf_boolean_t +ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop) +{ + /* Fops with no locks at this point mean that they are sent as sub-fops + * of other higher level fops. In this case we simply assume that the + * parent fop will take correct care of the eager lock. */ + if (fop->lock_count == 0) { + return _gf_true; + } + + /* We may have more than one lock, but this only happens in the rename + * fop, and both locks will reference an inode of the same type (a + * directory in this case), so we only need to check the first lock. */ + if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock; + } + + return ec->other_eager_lock; +} + +static uint32_t +ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock) +{ + if (lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock_timeout; + } + + return ec->other_eager_lock_timeout; +} + +static gf_boolean_t +ec_lock_delay_create(ec_lock_link_t *link) +{ + struct timespec delay; + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; + + delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock); + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, + ec_unlock_timer_cbk, link); + if (lock->timer == NULL) { + gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, + EC_MSG_UNLOCK_DELAY_FAILED, + "Unable to delay an unlock"); + + return _gf_false; + } + + return _gf_true; +} + void ec_unlock_timer_add(ec_lock_link_t *link) { - struct timespec delay; ec_fop_data_t *fop = link->fop; ec_lock_t *lock = link->lock; gf_boolean_t now = _gf_false; @@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link) ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, lock->release); - delay.tv_sec = 1; - delay.tv_nsec = 0; - lock->timer = gf_timer_call_after(fop->xl->ctx, delay, - ec_unlock_timer_cbk, link); - if (lock->timer == NULL) { - gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, - EC_MSG_UNLOCK_DELAY_FAILED, - "Unable to delay an unlock"); - + if (!ec_lock_delay_create(link)) { /* We are unable to create a new timer. We immediately release * the lock. */ lock->release = now = _gf_true; } + } else { ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, lock->release); @@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop) ec_update_info(&fop->locks[0]); } -static gf_boolean_t -ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) -{ - /* Fops with no locks at this point mean that they are sent as sub-fops - * of other higher level fops. In this case we simply assume that the - * parent fop will take correct care of the eager lock. */ - if (fop->lock_count == 0) { - return _gf_true; - } - - /* We may have more than one lock, but this only happens in the rename - * fop, and both locks will reference an inode of the same type (a - * directory in this case), so we only need to check the first lock. */ - if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { - return ec->eager_lock; - } - - return ec->other_eager_lock; -} - static void ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, ec_fop_data_t *fop) @@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop) ec = fop->xl->private; cbk = fop->answer; - if (ec_use_eager_lock(ec, fop) && cbk != NULL) { + if (ec_eager_lock_used(ec, fop) && cbk != NULL) { if (cbk->xdata != NULL) { if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) == 0) && (count > 1)) { diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index c3e291585ef..99e2f0653be 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, void ec_lock(ec_fop_data_t * fop); void ec_lock_reuse(ec_fop_data_t *fop); void ec_unlock(ec_fop_data_t * fop); +void ec_lock_release(ec_t *ec, inode_t *inode); gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size); diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 23dc434bc42..15b4c77abfe 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -669,6 +669,8 @@ struct _ec { uint32_t background_heals; uint32_t heal_wait_qlen; uint32_t self_heal_window_size; /* max size of read/writes */ + uint32_t eager_lock_timeout; + uint32_t other_eager_lock_timeout; struct list_head pending_fops; struct list_head heal_waiting; struct list_head healing; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 4c80f1283f1..30b0bdcb29c 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options) bool, failed); GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options, bool, failed); + GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout, + options, uint32, failed); + GF_OPTION_RECONF ("other-eager-lock-timeout", + ec->other_eager_lock_timeout, options, uint32, + failed); GF_OPTION_RECONF ("background-heals", background_heals, options, uint32, failed); GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options, @@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) } } +static gf_boolean_t +ec_upcall(ec_t *ec, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *ci = NULL; + struct gf_upcall_inodelk_contention *lc = NULL; + inode_t *inode; + + switch (upcall->event_type) { + case GF_UPCALL_CACHE_INVALIDATION: + ci = upcall->data; + ci->flags |= UP_INVAL_ATTR; + return _gf_true; + + case GF_UPCALL_INODELK_CONTENTION: + lc = upcall->data; + if (strcmp(lc->domain, ec->xl->name) != 0) { + /* The lock is not owned by EC, ignore it. */ + return _gf_true; + } + inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable, + upcall->gfid); + /* If inode is not found, it means that it's already released, + * so we can ignore it. Probably it has been released and + * destroyed while the contention notification was being sent. + */ + if (inode != NULL) { + ec_lock_release(ec, inode); + inode_unref(inode); + } + + return _gf_false; + + default: + return _gf_true; + } +} + int32_t ec_notify (xlator_t *this, int32_t event, void *data, void *data2) { @@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) dict_t *output = NULL; gf_boolean_t propagate = _gf_true; int32_t orig_event = event; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; uintptr_t mask = 0; gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p", event, data, data2); if (event == GF_EVENT_UPCALL) { - up_data = (struct gf_upcall *)data; - if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) { - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - up_ci->flags |= UP_INVAL_ATTR; - } + propagate = ec_upcall(ec, data); goto done; } @@ -664,6 +700,10 @@ init (xlator_t *this) GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed); GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed); GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed); + GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32, + failed); + GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout, + uint32, failed); GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed); GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed); GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32, @@ -1456,6 +1496,29 @@ struct volume_options options[] = .description = "It's equivalent to the eager-lock option but for non " "regular files." }, + { .key = {"eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = { GD_OP_VERSION_4_0_0 }, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = { "disperse", "locks", "timeout" }, + .description = "Maximum time (in seconds) that a lock on an inode is " + "kept held if no new operations on the inode are " + "received." + }, + { .key = {"other-eager-lock-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 60, + .default_value = "1", + .op_version = { GD_OP_VERSION_4_0_0 }, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = { "disperse", "locks", "timeout" }, + .description = "It's equivalent ot eager-lock-timeout option but for " + "non regular files." + }, { .key = {"background-heals"}, .type = GF_OPTION_TYPE_INT, .min = 0,/*Disabling background heals*/ diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index a76d6beacb1..1609fc416d2 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -200,6 +200,7 @@ int clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, clrlk_args *args, int *blkd, int *granted, int *op_errno) { + posix_locks_private_t *priv; pl_inode_lock_t *ilock = NULL; pl_inode_lock_t *tmp = NULL; struct gf_flock ulock = {0, }; @@ -207,9 +208,20 @@ clrlk_clear_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, int bcount = 0; int gcount = 0; gf_boolean_t chk_range = _gf_false; + struct list_head *pcontend = NULL; struct list_head released; + struct list_head contend; + struct timespec now = { }; INIT_LIST_HEAD (&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD (pcontend); + timespec_now(&now); + } + if (clrlk_get_lock_range (args->opts, &ulock, &chk_range)) { *op_errno = EINVAL; goto out; @@ -283,7 +295,10 @@ granted: ret = 0; out: - grant_blocked_inode_locks (this, pl_inode, dom); + grant_blocked_inode_locks (this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } *blkd = bcount; *granted = gcount; return ret; @@ -294,15 +309,27 @@ int clrlk_clear_entrylk (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom, clrlk_args *args, int *blkd, int *granted, int *op_errno) { + posix_locks_private_t *priv; pl_entry_lock_t *elock = NULL; pl_entry_lock_t *tmp = NULL; int bcount = 0; int gcount = 0; int ret = -1; + struct list_head *pcontend = NULL; struct list_head removed; struct list_head released; + struct list_head contend; + struct timespec now; INIT_LIST_HEAD (&released); + + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD (pcontend); + timespec_now(&now); + } + if (args->kind & CLRLK_BLOCKED) goto blkd; @@ -361,12 +388,15 @@ granted: list_del_init (&elock->domain_list); list_add_tail (&elock->domain_list, &removed); - __pl_entrylk_unref (elock); + __pl_entrylk_unref (elock); } } pthread_mutex_unlock (&pl_inode->mutex); - grant_blocked_entry_locks (this, pl_inode, dom); + grant_blocked_entry_locks (this, pl_inode, dom, &now, pcontend); + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } ret = 0; out: diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 3729ca24bed..50c156feb38 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -69,7 +69,11 @@ get_domain (pl_inode_t *pl_inode, const char *volume); void grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom); + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); + +void +inodelk_contention_notify (xlator_t *this, struct list_head *contend); void __delete_inode_lock (pl_inode_lock_t *lock); @@ -79,7 +83,11 @@ __pl_inodelk_unref (pl_inode_lock_t *lock); void grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom); + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); + +void +entrylk_contention_notify (xlator_t *this, struct list_head *contend); void pl_update_refkeeper (xlator_t *this, inode_t *inode); diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 6698516fc83..008d05a34c4 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -13,17 +13,19 @@ #include "logging.h" #include "common-utils.h" #include "list.h" +#include "upcall-utils.h" #include "locks.h" #include "clear.h" #include "common.h" +#include "pl-messages.h" void __pl_entrylk_unref (pl_entry_lock_t *lock) { lock->ref--; if (!lock->ref) { - GF_FREE ((char *)lock->basename); + GF_FREE ((char *)lock->basename); GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -39,7 +41,7 @@ __pl_entrylk_ref (pl_entry_lock_t *lock) static pl_entry_lock_t * new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, - const char *domain, call_frame_t *frame, char *conn_id) + const char *domain, call_frame_t *frame, char *conn_id) { pl_entry_lock_t *newlock = NULL; @@ -55,7 +57,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, newlock->client_pid = frame->root->pid; newlock->volume = domain; newlock->owner = frame->root->lk_owner; - newlock->frame = frame; + newlock->frame = frame; newlock->this = frame->this; if (conn_id) { @@ -64,9 +66,9 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, INIT_LIST_HEAD (&newlock->domain_list); INIT_LIST_HEAD (&newlock->blocked_locks); - INIT_LIST_HEAD (&newlock->client_list); + INIT_LIST_HEAD (&newlock->client_list); - __pl_entrylk_ref (newlock); + __pl_entrylk_ref (newlock); out: return newlock; } @@ -201,6 +203,113 @@ out: return revoke_lock; } +static gf_boolean_t +__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, + struct timespec *now) +{ + posix_locks_private_t *priv; + int64_t elapsed; + + priv = this->private; + + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { + return _gf_false; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return _gf_false; + } + + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To avoid the lock or the inode to be + * destroyed before we process them, we take an additional reference + * on both. */ + inode_ref(lock->pinode->inode); + __pl_entrylk_ref(lock); + + lock->contention_time = *now; + + return _gf_true; +} + +void +entrylk_contention_notify(xlator_t *this, struct list_head *contend) +{ + struct gf_upcall up; + struct gf_upcall_entrylk_contention lc; + pl_entry_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_entry_lock_t, contend); + + pl_inode = lock->pinode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->domain_list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + lc.type = lock->type; + lc.name = lock->basename; + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pinode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the entrylk + * was issued by a server side xlator. */ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } + + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_ENTRYLK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Entrylk contention notification " + "sent"); + } + } + + pthread_mutex_lock(&pl_inode->mutex); + + list_del_init(&lock->contend); + __pl_entrylk_unref(lock); + + pthread_mutex_unlock(&pl_inode->mutex); + + inode_unref(pl_inode->inode); + } +} + + /** * entrylk_grantable - is this lock grantable? * @inode: inode in which to look @@ -208,19 +317,27 @@ out: * @type: type of lock */ static pl_entry_lock_t * -__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock) +__entrylk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock, + struct timespec *now, struct list_head *contend) { pl_entry_lock_t *tmp = NULL; - - if (list_empty (&dom->entrylk_list)) - return NULL; + pl_entry_lock_t *ret = NULL; list_for_each_entry (tmp, &dom->entrylk_list, domain_list) { - if (__conflicting_entrylks (tmp, lock)) - return tmp; + if (__conflicting_entrylks (tmp, lock)) { + if (ret == NULL) { + ret = tmp; + if (contend == NULL) { + break; + } + } + if (__entrylk_needs_contention_notify(this, tmp, now)) { + list_add_tail(&tmp->contend, contend); + } + } } - return NULL; + return ret; } static pl_entry_lock_t * @@ -228,9 +345,6 @@ __blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock) { pl_entry_lock_t *tmp = NULL; - if (list_empty (&dom->blocked_entrylks)) - return NULL; - list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) { if (names_conflict (tmp->basename, lock->basename)) return lock; @@ -426,6 +540,27 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock) return NULL; } +static int +__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock, int nonblock) +{ + struct timeval now; + + gettimeofday(&now, NULL); + + if (nonblock) + goto out; + + lock->blkd_time = now; + list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); + + gf_msg_trace (this->name, 0, "Blocking lock: {pinode=%p, basename=%s}", + pinode, lock->basename); + +out: + return -EAGAIN; +} + /** * __lock_entrylk - lock a name in a directory * @inode: inode for the directory in which to lock @@ -439,24 +574,15 @@ __find_matching_lock (pl_dom_list_t *dom, pl_entry_lock_t *lock) int __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, - int nonblock, pl_dom_list_t *dom) + int nonblock, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { pl_entry_lock_t *conf = NULL; int ret = -EAGAIN; - conf = __entrylk_grantable (dom, lock); + conf = __entrylk_grantable (this, dom, lock, now, contend); if (conf) { - ret = -EAGAIN; - if (nonblock) - goto out; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, lock->basename); - + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); goto out; } @@ -471,20 +597,15 @@ __lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock, * granted, without which self-heal can't progress. * TODO: Find why 'owner_has_lock' is checked even for blocked locks. */ - if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (nonblock) - goto out; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks); - - gf_log (this->name, GF_LOG_DEBUG, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "Blocking lock: {pinode=%p, basename=%s}", - pinode, lock->basename); + if (__blocked_entrylk_conflict (dom, lock) && + !(__owner_has_lock (dom, lock))) { + if (nonblock == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); + } + ret = __lock_blocked_add(this, pinode, dom, lock, nonblock); goto out; } @@ -551,7 +672,8 @@ out: void __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom, struct list_head *granted) + pl_dom_list_t *dom, struct list_head *granted, + struct timespec *now, struct list_head *contend) { int bl_ret = 0; pl_entry_lock_t *bl = NULL; @@ -566,7 +688,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, list_del_init (&bl->blocked_locks); - bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom); + bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom, now, + contend); if (bl_ret == 0) { list_add (&bl->blocked_locks, granted); @@ -578,7 +701,8 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, /* Grants locks if possible which are blocked on a lock */ void grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom) + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { struct list_head granted_list; pl_entry_lock_t *tmp = NULL; @@ -589,7 +713,7 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, pthread_mutex_lock (&pl_inode->mutex); { __grant_blocked_entry_locks (this, pl_inode, dom, - &granted_list); + &granted_list, now, contend); } pthread_mutex_unlock (&pl_inode->mutex); @@ -610,8 +734,6 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, } } pthread_mutex_unlock (&pl_inode->mutex); - - return; } @@ -637,9 +759,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, int nonblock = 0; gf_boolean_t need_inode_unref = _gf_false; posix_locks_private_t *priv = NULL; + struct list_head *pcontend = NULL; + struct list_head contend; + struct timespec now = { }; priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + if (xdata) dict_ret = dict_get_str (xdata, "connection-id", &conn_id); @@ -722,7 +853,8 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, { reqlock->pinode = pinode; - ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom); + ret = __lock_entrylk (this, pinode, reqlock, nonblock, + dom, &now, pcontend); if (ret == 0) { reqlock->frame = NULL; op_ret = 0; @@ -778,7 +910,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, if (ctx) pthread_mutex_unlock (&ctx->lock); - grant_blocked_entry_locks (this, pinode, dom); + grant_blocked_entry_locks (this, pinode, dom, &now, pcontend); break; @@ -810,6 +942,10 @@ unwind: cmd, type); } + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } + return 0; } @@ -868,27 +1004,37 @@ pl_entrylk_log_cleanup (pl_entry_lock_t *lock) int pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) { + posix_locks_private_t *priv; pl_entry_lock_t *tmp = NULL; pl_entry_lock_t *l = NULL; - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; pl_inode_t *pinode = NULL; - + struct list_head *pcontend = NULL; struct list_head released; struct list_head unwind; + struct list_head contend; + struct timespec now = { }; INIT_LIST_HEAD (&released); INIT_LIST_HEAD (&unwind); - pthread_mutex_lock (&ctx->lock); + priv = this->private; + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD (pcontend); + timespec_now(&now); + } + + pthread_mutex_lock (&ctx->lock); { list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers, - client_list) { - pl_entrylk_log_cleanup (l); + client_list) { + pl_entrylk_log_cleanup (l); - pinode = l->pinode; + pinode = l->pinode; - pthread_mutex_lock (&pinode->mutex); - { + pthread_mutex_lock (&pinode->mutex); + { /* If the entrylk object is part of granted list but not * blocked list, then perform the following actions: * i. delete the object from granted list; @@ -931,38 +1077,42 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) &unwind); } } - pthread_mutex_unlock (&pinode->mutex); + pthread_mutex_unlock (&pinode->mutex); } - } + } pthread_mutex_unlock (&ctx->lock); list_for_each_entry_safe (l, tmp, &unwind, client_list) { list_del_init (&l->client_list); - if (l->frame) - STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN, - NULL); + if (l->frame) + STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN, + NULL); list_add_tail (&l->client_list, &released); } list_for_each_entry_safe (l, tmp, &released, client_list) { list_del_init (&l->client_list); - pinode = l->pinode; + pinode = l->pinode; - dom = get_domain (pinode, l->volume); + dom = get_domain (pinode, l->volume); - grant_blocked_entry_locks (this, pinode, dom); + grant_blocked_entry_locks (this, pinode, dom, &now, pcontend); - pthread_mutex_lock (&pinode->mutex); - { - __pl_entrylk_unref (l); - } - pthread_mutex_unlock (&pinode->mutex); + pthread_mutex_lock (&pinode->mutex); + { + __pl_entrylk_unref (l); + } + pthread_mutex_unlock (&pinode->mutex); inode_unref (pinode->inode); } + if (pcontend != NULL) { + entrylk_contention_notify(this, pcontend); + } + return 0; } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 64ffb00c18c..890ac8b6d00 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -13,10 +13,12 @@ #include "logging.h" #include "common-utils.h" #include "list.h" +#include "upcall-utils.h" #include "locks.h" #include "clear.h" #include "common.h" +#include "pl-messages.h" void __delete_inode_lock (pl_inode_lock_t *lock) @@ -229,22 +231,134 @@ out: return revoke_lock; } +static gf_boolean_t +__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, + struct timespec *now) +{ + posix_locks_private_t *priv; + int64_t elapsed; + + priv = this->private; + + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { + return _gf_false; + } + + elapsed = now->tv_sec; + elapsed -= lock->contention_time.tv_sec; + if (now->tv_nsec < lock->contention_time.tv_nsec) { + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { + return _gf_false; + } + + /* All contention notifications will be sent outside of the locked + * region. This means that currently granted locks might have already + * been unlocked by that time. To avoid the lock or the inode to be + * destroyed before we process them, we take an additional reference + * on both. */ + inode_ref(lock->pl_inode->inode); + __pl_inodelk_ref(lock); + + lock->contention_time = *now; + + return _gf_true; +} + +void +inodelk_contention_notify(xlator_t *this, struct list_head *contend) +{ + struct gf_upcall up; + struct gf_upcall_inodelk_contention lc; + pl_inode_lock_t *lock; + pl_inode_t *pl_inode; + client_t *client; + gf_boolean_t notify; + + while (!list_empty(contend)) { + lock = list_first_entry(contend, pl_inode_lock_t, contend); + + pl_inode = lock->pl_inode; + + pthread_mutex_lock(&pl_inode->mutex); + + /* If the lock has already been released, no notification is + * sent. We clear the notification time in this case. */ + notify = !list_empty(&lock->list); + if (!notify) { + lock->contention_time.tv_sec = 0; + lock->contention_time.tv_nsec = 0; + } else { + memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock)); + lc.pid = lock->client_pid; + lc.domain = lock->volume; + lc.xdata = NULL; + + gf_uuid_copy(up.gfid, lock->pl_inode->gfid); + client = (client_t *)lock->client; + if (client == NULL) { + /* A NULL client can be found if the inodelk + * was issued by a server side xlator. */ + up.client_uid = NULL; + } else { + up.client_uid = client->client_uid; + } + } + + pthread_mutex_unlock(&pl_inode->mutex); + + if (notify) { + up.event_type = GF_UPCALL_INODELK_CONTENTION; + up.data = &lc; + + if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "failed"); + } else { + gf_msg_debug(this->name, 0, + "Inodelk contention notification " + "sent"); + } + } + + pthread_mutex_lock(&pl_inode->mutex); + + list_del_init(&lock->contend); + __pl_inodelk_unref(lock); + + pthread_mutex_unlock(&pl_inode->mutex); + + inode_unref(pl_inode->inode); + } +} + /* Determine if lock is grantable or not */ static pl_inode_lock_t * -__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) +__inodelk_grantable (xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + struct timespec *now, struct list_head *contend) { pl_inode_lock_t *l = NULL; pl_inode_lock_t *ret = NULL; - if (list_empty (&dom->inodelk_list)) - goto out; + list_for_each_entry (l, &dom->inodelk_list, list){ if (inodelk_conflict (lock, l) && !same_inodelk_owner (lock, l)) { - ret = l; - goto out; + if (ret == NULL) { + ret = l; + if (contend == NULL) { + break; + } + } + if (__inodelk_needs_contention_notify(this, l, now)) { + list_add_tail(&l->contend, contend); + } } } -out: + return ret; } @@ -252,20 +366,14 @@ static pl_inode_lock_t * __blocked_lock_conflict (pl_dom_list_t *dom, pl_inode_lock_t *lock) { pl_inode_lock_t *l = NULL; - pl_inode_lock_t *ret = NULL; - - if (list_empty (&dom->blocked_inodelks)) - return NULL; list_for_each_entry (l, &dom->blocked_inodelks, blocked_locks) { if (inodelk_conflict (lock, l)) { - ret = l; - goto out; + return l; } } -out: - return ret; + return NULL; } static int @@ -286,35 +394,45 @@ __owner_has_lock (pl_dom_list_t *dom, pl_inode_lock_t *newlock) return 0; } +static int +__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + int can_block) +{ + struct timeval now; + + gettimeofday(&now, NULL); + + if (can_block == 0) { + goto out; + } + + lock->blkd_time = now; + list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); + + gf_msg_trace (this->name, 0, "%s (pid=%d) (lk-owner=%s) %"PRId64" - " + "%"PRId64" => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa (&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); + +out: + return -EAGAIN; +} /* Determines if lock can be granted and adds the lock. If the lock * is blocking, adds it to the blocked_inodelks list of the domain. */ static int __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, - int can_block, pl_dom_list_t *dom) + int can_block, pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { pl_inode_lock_t *conf = NULL; int ret = -EINVAL; - conf = __inodelk_grantable (dom, lock); + conf = __inodelk_grantable (this, dom, lock, now, contend); if (conf) { - ret = -EAGAIN; - if (can_block == 0) - goto out; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); - - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) lk-owner:%s %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - - + ret = __lock_blocked_add(this, dom, lock, can_block); goto out; } @@ -330,25 +448,15 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, * will not be unlocked by SHD from Machine1. * TODO: Find why 'owner_has_lock' is checked even for blocked locks. */ - if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { - ret = -EAGAIN; - if (can_block == 0) - goto out; - - gettimeofday (&lock->blkd_time, NULL); - list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); - - gf_log (this->name, GF_LOG_DEBUG, - "Lock is grantable, but blocking to prevent starvation"); - gf_log (this->name, GF_LOG_TRACE, - "%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked", - lock->fl_type == F_UNLCK ? "Unlock" : "Lock", - lock->client_pid, - lkowner_utoa (&lock->owner), - lock->user_flock.l_start, - lock->user_flock.l_len); - + if (__blocked_lock_conflict (dom, lock) && + !(__owner_has_lock (dom, lock))) { + if (can_block != 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Lock is grantable, but blocking to prevent " + "starvation"); + } + ret = __lock_blocked_add(this, dom, lock, can_block); goto out; } __pl_inodelk_ref (lock); @@ -417,7 +525,8 @@ out: static void __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - struct list_head *granted, pl_dom_list_t *dom) + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend) { int bl_ret = 0; pl_inode_lock_t *bl = NULL; @@ -432,7 +541,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, list_del_init (&bl->blocked_locks); - bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom); + bl_ret = __lock_inodelk (this, pl_inode, bl, 1, dom, now, + contend); if (bl_ret == 0) { list_add (&bl->blocked_locks, granted); @@ -444,7 +554,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, /* Grant all inodelks blocked on a lock */ void grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, - pl_dom_list_t *dom) + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend) { struct list_head granted; pl_inode_lock_t *lock; @@ -454,7 +565,8 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pthread_mutex_lock (&pl_inode->mutex); { - __grant_blocked_inode_locks (this, pl_inode, &granted, dom); + __grant_blocked_inode_locks (this, pl_inode, &granted, dom, + now, contend); } pthread_mutex_unlock (&pl_inode->mutex); @@ -471,7 +583,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, &lock->user_flock, 0, 0, lock->volume); STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL); - lock->frame = NULL; + lock->frame = NULL; } pthread_mutex_lock (&pl_inode->mutex); @@ -488,9 +600,9 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, static void pl_inodelk_log_cleanup (pl_inode_lock_t *lock) { - pl_inode_t *pl_inode = NULL; + pl_inode_t *pl_inode = NULL; - pl_inode = lock->pl_inode; + pl_inode = lock->pl_inode; gf_log (THIS->name, GF_LOG_WARNING, "releasing lock on %s held by " "{client=%p, pid=%"PRId64" lk-owner=%s}", @@ -503,27 +615,38 @@ pl_inodelk_log_cleanup (pl_inode_lock_t *lock) int pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) { + posix_locks_private_t *priv; pl_inode_lock_t *tmp = NULL; pl_inode_lock_t *l = NULL; - pl_dom_list_t *dom = NULL; + pl_dom_list_t *dom = NULL; pl_inode_t *pl_inode = NULL; - + struct list_head *pcontend = NULL; struct list_head released; struct list_head unwind; + struct list_head contend; + struct timespec now = { }; + + priv = this->private; INIT_LIST_HEAD (&released); INIT_LIST_HEAD (&unwind); - pthread_mutex_lock (&ctx->lock); + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD (pcontend); + timespec_now(&now); + } + + pthread_mutex_lock (&ctx->lock); { list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers, - client_list) { - pl_inodelk_log_cleanup (l); + client_list) { + pl_inodelk_log_cleanup (l); - pl_inode = l->pl_inode; + pl_inode = l->pl_inode; - pthread_mutex_lock (&pl_inode->mutex); - { + pthread_mutex_lock (&pl_inode->mutex); + { /* If the inodelk object is part of granted list but not * blocked list, then perform the following actions: * i. delete the object from granted list; @@ -567,45 +690,49 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) &unwind); } } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_unlock (&pl_inode->mutex); } - } + } pthread_mutex_unlock (&ctx->lock); list_for_each_entry_safe (l, tmp, &unwind, client_list) { list_del_init (&l->client_list); if (l->frame) - STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, - NULL); + STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, + NULL); list_add_tail (&l->client_list, &released); - } list_for_each_entry_safe (l, tmp, &released, client_list) { list_del_init (&l->client_list); - pl_inode = l->pl_inode; + pl_inode = l->pl_inode; - dom = get_domain (pl_inode, l->volume); + dom = get_domain (pl_inode, l->volume); - grant_blocked_inode_locks (this, pl_inode, dom); + grant_blocked_inode_locks (this, pl_inode, dom, &now, + pcontend); - pthread_mutex_lock (&pl_inode->mutex); - { - __pl_inodelk_unref (l); - } - pthread_mutex_unlock (&pl_inode->mutex); + pthread_mutex_lock (&pl_inode->mutex); + { + __pl_inodelk_unref (l); + } + pthread_mutex_unlock (&pl_inode->mutex); inode_unref (pl_inode->inode); } + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + return 0; } static int pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, - pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, + pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, inode_t *inode) { posix_locks_private_t *priv = NULL; @@ -613,9 +740,12 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, pl_inode_lock_t *retlock = NULL; gf_boolean_t unref = _gf_true; gf_boolean_t need_inode_unref = _gf_false; + struct list_head *pcontend = NULL; + struct list_head contend; + struct timespec now = { }; short fl_type; - lock->pl_inode = pl_inode; + lock->pl_inode = pl_inode; fl_type = lock->fl_type; priv = this->private; @@ -657,12 +787,19 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, } } + if (priv->notify_contention) { + pcontend = &contend; + INIT_LIST_HEAD(pcontend); + timespec_now(&now); + } + if (ctx) pthread_mutex_lock (&ctx->lock); pthread_mutex_lock (&pl_inode->mutex); { if (lock->fl_type != F_UNLCK) { - ret = __lock_inodelk (this, pl_inode, lock, can_block, dom); + ret = __lock_inodelk (this, pl_inode, lock, can_block, + dom, &now, pcontend); if (ret == 0) { lock->frame = NULL; gf_log (this->name, GF_LOG_TRACE, @@ -725,13 +862,18 @@ out: */ if ((fl_type == F_UNLCK) && (ret == 0)) { inode_unref (pl_inode->inode); - grant_blocked_inode_locks (this, pl_inode, dom); + grant_blocked_inode_locks (this, pl_inode, dom, &now, + pcontend); } if (need_inode_unref) { inode_unref (pl_inode->inode); } + if (pcontend != NULL) { + inodelk_contention_notify(this, pcontend); + } + return ret; } @@ -771,7 +913,8 @@ new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, INIT_LIST_HEAD (&lock->list); INIT_LIST_HEAD (&lock->blocked_locks); - INIT_LIST_HEAD (&lock->client_list); + INIT_LIST_HEAD (&lock->client_list); + INIT_LIST_HEAD (&lock->contend); __pl_inodelk_ref (lock); return lock; diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 3d3b327f56c..c2edfff8f00 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -70,6 +70,7 @@ typedef struct __posix_lock posix_lock_t; struct __pl_inode_lock { struct list_head list; struct list_head blocked_locks; /* list_head pointing to blocked_inodelks */ + struct list_head contend; /* list of contending locks */ int ref; short fl_type; @@ -86,6 +87,8 @@ struct __pl_inode_lock { struct timeval blkd_time; /*time at which lock was queued into blkd list*/ struct timeval granted_time; /*time at which lock was queued into active list*/ + /*last time at wich lock contention was detected and notified*/ + struct timespec contention_time; /* These two together serve to uniquely identify each process across nodes */ @@ -120,6 +123,7 @@ typedef struct _pl_dom_list pl_dom_list_t; struct __entry_lock { struct list_head domain_list; /* list_head back to pl_dom_list_t */ struct list_head blocked_locks; /* list_head back to blocked_entrylks */ + struct list_head contend; /* list of contending locks */ int ref; call_frame_t *frame; @@ -133,6 +137,8 @@ struct __entry_lock { struct timeval blkd_time; /*time at which lock was queued into blkd list*/ struct timeval granted_time; /*time at which lock was queued into active list*/ + /*last time at wich lock contention was detected and notified*/ + struct timespec contention_time; void *client; gf_lkowner_t owner; @@ -194,6 +200,8 @@ typedef struct { uint32_t revocation_secs; gf_boolean_t revocation_clear_all; uint32_t revocation_max_blocked; + gf_boolean_t notify_contention; + uint32_t notify_contention_delay; } posix_locks_private_t; diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h index 7a1e3f488e7..e5a276f35b5 100644 --- a/xlators/features/locks/src/pl-messages.h +++ b/xlators/features/locks/src/pl-messages.h @@ -24,7 +24,9 @@ */ GLFS_MSGID(PL, - PL_MSG_LOCK_NUMBER + PL_MSG_LOCK_NUMBER, + PL_MSG_INODELK_CONTENTION_FAILED, + PL_MSG_ENTRYLK_CONTENTION_FAILED ); #endif /* !_PL_MESSAGES_H_ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 78bf160058c..82d77db1164 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -3641,6 +3641,13 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("revocation-max-blocked", priv->revocation_max_blocked, options, uint32, out); + + GF_OPTION_RECONF ("notify-contention", priv->notify_contention, + options, bool, out); + + GF_OPTION_RECONF ("notify-contention-delay", + priv->notify_contention_delay, options, uint32, out); + ret = 0; out: @@ -3705,6 +3712,12 @@ init (xlator_t *this) GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked, uint32, out); + GF_OPTION_INIT ("notify-contention", priv->notify_contention, bool, + out); + + GF_OPTION_INIT ("notify-contention-delay", + priv->notify_contention_delay, uint32, out); + this->local_pool = mem_pool_new (pl_local_t, 32); if (!this->local_pool) { ret = -1; @@ -4461,5 +4474,32 @@ struct volume_options options[] = { "will be revoked to allow the others to proceed. Can " "be used in conjunction w/ revocation-clear-all." }, + { .key = {"notify-contention"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = { GD_OP_VERSION_4_0_0 }, + .tags = { "locks", "contention" }, + .description = "When this option is enabled and a lock request " + "conflicts with a currently granted lock, an upcall " + "notification will be sent to the current owner of " + "the lock to request it to be released as soon as " + "possible." + }, + { .key = {"notify-contention-delay"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, /* An upcall notification is sent every time a conflict is + * detected. */ + .max = 60, + .default_value = "5", + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC, + .op_version = { GD_OP_VERSION_4_0_0 }, + .tags = { "locks", "contention", "timeout" }, + .description = "This value determines the minimum amount of time " + "(in seconds) between upcall contention notifications " + "on the same inode. If multiple lock requests are " + "received during this period, only one upcall will " + "be sent." + }, { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index da85cb5297f..e0ec3368ca7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1484,6 +1484,16 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_13_0, .flags = VOLOPT_FLAG_CLIENT_OPT }, + { .key = "disperse.eager-lock-timeout", + .voltype = "cluster/disperse", + .op_version = GD_OP_VERSION_4_0_0, + .flags = VOLOPT_FLAG_CLIENT_OPT + }, + { .key = "disperse.other-eager-lock-timeout", + .voltype = "cluster/disperse", + .op_version = GD_OP_VERSION_4_0_0, + .flags = VOLOPT_FLAG_CLIENT_OPT + }, { .key = "cluster.quorum-type", .voltype = "cluster/replicate", .option = "quorum-type", @@ -3489,6 +3499,16 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_9_0, .type = NO_DOC, }, + { .option = "notify-contention", + .key = "features.locks-notify-contention", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_4_0_0, + }, + { .option = "notify-contention-delay", + .key = "features.locks-notify-contention-delay", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_4_0_0, + }, { .key = "disperse.shd-max-threads", .voltype = "cluster/disperse", .op_version = GD_OP_VERSION_3_9_0, diff --git a/xlators/protocol/client/src/client-callback.c b/xlators/protocol/client/src/client-callback.c index 51164e57230..b2f9a225887 100644 --- a/xlators/protocol/client/src/client-callback.c +++ b/xlators/protocol/client/src/client-callback.c @@ -173,6 +173,101 @@ out: return 0; } +int +client_cbk_inodelk_contention (struct rpc_clnt *rpc, void *mydata, void *data) +{ + int ret = -1; + struct iovec *iov = NULL; + struct gf_upcall upcall_data = {0,}; + struct gf_upcall_inodelk_contention lc = {{0,},}; + gfs4_inodelk_contention_req proto_lc = {{0,},}; + + GF_VALIDATE_OR_GOTO ("client-callback", rpc, out); + GF_VALIDATE_OR_GOTO ("client-callback", mydata, out); + GF_VALIDATE_OR_GOTO ("client-callback", data, out); + + iov = (struct iovec *)data; + ret = xdr_to_generic (*iov, &proto_lc, + (xdrproc_t)xdr_gfs4_inodelk_contention_req); + + if (ret < 0) { + gf_msg (THIS->name, GF_LOG_WARNING, -ret, + PC_MSG_INODELK_CONTENTION_FAIL, + "XDR decode of inodelk contention failed."); + goto out; + } + + upcall_data.data = &lc; + ret = gf_proto_inodelk_contention_to_upcall (&proto_lc, &upcall_data); + if (ret < 0) + goto out; + + upcall_data.event_type = GF_UPCALL_INODELK_CONTENTION; + + default_notify (THIS, GF_EVENT_UPCALL, &upcall_data); + +out: + if (proto_lc.domain) + free (proto_lc.domain); + + if (proto_lc.xdata.xdata_val) + free (proto_lc.xdata.xdata_val); + + if (lc.xdata) + dict_unref (lc.xdata); + + return ret; +} + +int +client_cbk_entrylk_contention (struct rpc_clnt *rpc, void *mydata, void *data) +{ + int ret = -1; + struct iovec *iov = NULL; + struct gf_upcall upcall_data = {0,}; + struct gf_upcall_entrylk_contention lc = {0,}; + gfs4_entrylk_contention_req proto_lc = {{0,},}; + + GF_VALIDATE_OR_GOTO ("client-callback", rpc, out); + GF_VALIDATE_OR_GOTO ("client-callback", mydata, out); + GF_VALIDATE_OR_GOTO ("client-callback", data, out); + + iov = (struct iovec *)data; + ret = xdr_to_generic (*iov, &proto_lc, + (xdrproc_t)xdr_gfs4_entrylk_contention_req); + + if (ret < 0) { + gf_msg (THIS->name, GF_LOG_WARNING, -ret, + PC_MSG_ENTRYLK_CONTENTION_FAIL, + "XDR decode of entrylk contention failed."); + goto out; + } + + upcall_data.data = &lc; + ret = gf_proto_entrylk_contention_to_upcall (&proto_lc, &upcall_data); + if (ret < 0) + goto out; + + upcall_data.event_type = GF_UPCALL_ENTRYLK_CONTENTION; + + default_notify (THIS, GF_EVENT_UPCALL, &upcall_data); + +out: + if (proto_lc.name) + free (proto_lc.name); + + if (proto_lc.domain) + free (proto_lc.domain); + + if (proto_lc.xdata.xdata_val) + free (proto_lc.xdata.xdata_val); + + if (lc.xdata) + dict_unref (lc.xdata); + + return ret; +} + rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = { [GF_CBK_NULL] = {"NULL", GF_CBK_NULL, client_cbk_null }, [GF_CBK_FETCHSPEC] = {"FETCHSPEC", GF_CBK_FETCHSPEC, client_cbk_fetchspec }, @@ -181,6 +276,8 @@ rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = { [GF_CBK_CHILD_UP] = {"CHILD_UP", GF_CBK_CHILD_UP, client_cbk_child_up }, [GF_CBK_CHILD_DOWN] = {"CHILD_DOWN", GF_CBK_CHILD_DOWN, client_cbk_child_down }, [GF_CBK_RECALL_LEASE] = {"RECALL_LEASE", GF_CBK_RECALL_LEASE, client_cbk_recall_lease }, + [GF_CBK_INODELK_CONTENTION] = {"INODELK_CONTENTION", GF_CBK_INODELK_CONTENTION, client_cbk_inodelk_contention }, + [GF_CBK_ENTRYLK_CONTENTION] = {"ENTRYLK_CONTENTION", GF_CBK_ENTRYLK_CONTENTION, client_cbk_entrylk_contention }, }; diff --git a/xlators/protocol/client/src/client-messages.h b/xlators/protocol/client/src/client-messages.h index 86b721bd593..5f146c67efe 100644 --- a/xlators/protocol/client/src/client-messages.h +++ b/xlators/protocol/client/src/client-messages.h @@ -89,7 +89,9 @@ GLFS_MSGID(PC, PC_MSG_CACHE_INVALIDATION_FAIL, PC_MSG_CHILD_STATUS, PC_MSG_GFID_NULL, - PC_MSG_RECALL_LEASE_FAIL + PC_MSG_RECALL_LEASE_FAIL, + PC_MSG_INODELK_CONTENTION_FAIL, + PC_MSG_ENTRYLK_CONTENTION_FAIL ); #endif /* !_PC_MESSAGES_H__ */ diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index 7154355e690..66122318c79 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -1349,6 +1349,8 @@ server_process_event_upcall (xlator_t *this, void *data) enum gf_cbk_procnum cbk_procnum = GF_CBK_NULL; gfs3_cbk_cache_invalidation_req gf_c_req = {0,}; gfs3_recall_lease_req gf_recall_lease = {{0,},}; + gfs4_inodelk_contention_req gf_inodelk_contention = {{0},}; + gfs4_entrylk_contention_req gf_entrylk_contention = {{0},}; xdrproc_t xdrproc; GF_VALIDATE_OR_GOTO(this->name, data, out); @@ -1358,7 +1360,16 @@ server_process_event_upcall (xlator_t *this, void *data) upcall_data = (struct gf_upcall *)data; client_uid = upcall_data->client_uid; - GF_VALIDATE_OR_GOTO(this->name, client_uid, out); + /* client_uid could be NULL if the upcall was intended for a server's + * child xlator (so no client_uid available) but it hasn't handled + * the notification. For this reason we silently ignore any upcall + * request with a NULL client_uid, but -1 will be returned. + */ + if (client_uid == NULL) { + gf_msg_debug(this->name, 0, + "NULL client_uid for an upcall request"); + goto out; + } switch (upcall_data->event_type) { case GF_UPCALL_CACHE_INVALIDATION: @@ -1381,6 +1392,28 @@ server_process_event_upcall (xlator_t *this, void *data) cbk_procnum = GF_CBK_RECALL_LEASE; xdrproc = (xdrproc_t)xdr_gfs3_recall_lease_req; break; + case GF_UPCALL_INODELK_CONTENTION: + ret = gf_proto_inodelk_contention_from_upcall (this, + &gf_inodelk_contention, + upcall_data); + if (ret < 0) + goto out; + + up_req = &gf_inodelk_contention; + cbk_procnum = GF_CBK_INODELK_CONTENTION; + xdrproc = (xdrproc_t)xdr_gfs4_inodelk_contention_req; + break; + case GF_UPCALL_ENTRYLK_CONTENTION: + ret = gf_proto_entrylk_contention_from_upcall (this, + &gf_entrylk_contention, + upcall_data); + if (ret < 0) + goto out; + + up_req = &gf_entrylk_contention; + cbk_procnum = GF_CBK_ENTRYLK_CONTENTION; + xdrproc = (xdrproc_t)xdr_gfs4_entrylk_contention_req; + break; default: gf_msg (this->name, GF_LOG_WARNING, EINVAL, PS_MSG_INVALID_ENTRY, @@ -1417,11 +1450,10 @@ server_process_event_upcall (xlator_t *this, void *data) pthread_mutex_unlock (&conf->mutex); ret = 0; out: - if ((gf_c_req.xdata).xdata_val) - GF_FREE ((gf_c_req.xdata).xdata_val); - - if ((gf_recall_lease.xdata).xdata_val) - GF_FREE ((gf_recall_lease.xdata).xdata_val); + GF_FREE ((gf_c_req.xdata).xdata_val); + GF_FREE ((gf_recall_lease.xdata).xdata_val); + GF_FREE ((gf_inodelk_contention.xdata).xdata_val); + GF_FREE ((gf_entrylk_contention.xdata).xdata_val); return ret; } |