Diffstat (limited to 'xlators/features/locks/src/common.c')
-rw-r--r--  xlators/features/locks/src/common.c | 578
1 file changed, 545 insertions(+), 33 deletions(-)
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index 5ad5415ed79..a2c6be93e03 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -12,11 +12,10 @@
 #include <limits.h>
 #include <pthread.h>
 
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "logging.h"
-#include "common-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syncop.h>
 
 #include "locks.h"
 #include "common.h"
@@ -213,13 +212,11 @@ void
 pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd,
             struct gf_flock *flock, const char *domain)
 {
-    posix_locks_private_t *priv = NULL;
+    posix_locks_private_t *priv = this->private;
     char pl_locker[256];
     char pl_lockee[256];
     char pl_lock[256];
 
-    priv = this->private;
-
     if (!priv->trace)
         return;
@@ -291,13 +288,11 @@ pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc,
                int cmd, struct gf_flock *flock, const char *domain)
 {
-    posix_locks_private_t *priv = NULL;
+    posix_locks_private_t *priv = this->private;
     char pl_locker[256];
     char pl_lockee[256];
     char pl_lock[256];
 
-    priv = this->private;
-
     if (!priv->trace)
         return;
@@ -326,7 +321,7 @@ pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd)
     if (!priv->trace)
         return;
 
-    pl_inode = pl_inode_get(this, fd->inode);
+    pl_inode = pl_inode_get(this, fd->inode, NULL);
 
     if (pl_inode && __pl_inode_is_empty(pl_inode))
         return;
@@ -362,7 +357,7 @@ pl_update_refkeeper(xlator_t *this, inode_t *inode)
     int need_unref = 0;
     int need_ref = 0;
 
-    pl_inode = pl_inode_get(this, inode);
+    pl_inode = pl_inode_get(this, inode, NULL);
 
     if (!pl_inode)
         return;
@@ -389,8 +384,51 @@ pl_update_refkeeper(xlator_t *this, inode_t *inode)
         inode_ref(inode);
 }
 
+/* Get lock enforcement info from disk */
+int
+pl_fetch_mlock_info_from_disk(xlator_t *this, pl_inode_t *pl_inode,
+                              pl_local_t *local)
+{
+    dict_t *xdata_rsp = NULL;
+    int ret = 0;
+    int op_ret = 0;
+
+    if (!local) {
+        return -1;
+    }
+
+    if (local->fd) {
+        op_ret = syncop_fgetxattr(this, local->fd, &xdata_rsp,
+                                  GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
+    } else {
+        op_ret = syncop_getxattr(this, &local->loc[0], &xdata_rsp,
+                                 GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (op_ret >= 0) {
+            pl_inode->mlock_enforced = _gf_true;
+            pl_inode->check_mlock_info = _gf_false;
+        } else {
+            gf_msg(this->name, GF_LOG_WARNING, -op_ret, 0,
+                   "getxattr failed with %d", op_ret);
+            pl_inode->mlock_enforced = _gf_false;
+
+            if (-op_ret == ENODATA) {
+                pl_inode->check_mlock_info = _gf_false;
+            } else {
+                pl_inode->check_mlock_info = _gf_true;
+            }
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    return ret;
+}
+
 pl_inode_t *
-pl_inode_get(xlator_t *this, inode_t *inode)
+pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
 {
     uint64_t tmp_pl_inode = 0;
     pl_inode_t *pl_inode = NULL;
@@ -403,6 +441,7 @@ pl_inode_get(xlator_t *this, inode_t *inode)
         pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
         goto unlock;
     }
+
     pl_inode = GF_CALLOC(1, sizeof(*pl_inode), gf_locks_mt_pl_inode_t);
     if (!pl_inode) {
         goto unlock;
@@ -411,6 +450,7 @@ pl_inode_get(xlator_t *this, inode_t *inode)
     gf_log(this->name, GF_LOG_TRACE, "Allocating new pl inode");
 
     pthread_mutex_init(&pl_inode->mutex, NULL);
+    pthread_cond_init(&pl_inode->check_fop_wind_count, 0);
 
     INIT_LIST_HEAD(&pl_inode->dom_list);
     INIT_LIST_HEAD(&pl_inode->ext_list);
@@ -420,8 +460,16 @@ pl_inode_get(xlator_t *this, inode_t *inode)
     INIT_LIST_HEAD(&pl_inode->blocked_calls);
     INIT_LIST_HEAD(&pl_inode->metalk_list);
     INIT_LIST_HEAD(&pl_inode->queued_locks);
+    INIT_LIST_HEAD(&pl_inode->waiting);
     gf_uuid_copy(pl_inode->gfid, inode->gfid);
 
+    pl_inode->check_mlock_info = _gf_true;
+    pl_inode->mlock_enforced = _gf_false;
+
+    /* -2 means never looked up. -1 means something went wrong and link
+     * tracking is disabled. */
+    pl_inode->links = -2;
+
     ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
     if (ret) {
         pthread_mutex_destroy(&pl_inode->mutex);
@@ -433,13 +481,23 @@ pl_inode_get(xlator_t *this, inode_t *inode)
 unlock:
     UNLOCK(&inode->lock);
 
+    if ((pl_inode != NULL) && pl_is_mandatory_locking_enabled(pl_inode) &&
+        pl_inode->check_mlock_info && local) {
+        /* Note: The lock enforcement information per file can be stored in
+           the attribute flag of stat(x) in posix. With that there won't be
+           a need for doing getxattr post a reboot
+         */
+        pl_fetch_mlock_info_from_disk(this, pl_inode, local);
+    }
+
     return pl_inode;
 }
 
 /* Create a new posix_lock_t */
 posix_lock_t *
 new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
-               gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking)
+               gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking,
+               int32_t *op_errno)
 {
     posix_lock_t *lock = NULL;
@@ -447,8 +505,14 @@ new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
     GF_VALIDATE_OR_GOTO("posix-locks", client, out);
     GF_VALIDATE_OR_GOTO("posix-locks", fd, out);
 
+    if (!pl_is_lk_owner_valid(owner, client)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+
     lock = GF_CALLOC(1, sizeof(posix_lock_t), gf_locks_mt_posix_lock_t);
     if (!lock) {
+        *op_errno = ENOMEM;
         goto out;
     }
@@ -466,6 +530,7 @@ new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
     if (lock->client_uid == NULL) {
         GF_FREE(lock);
         lock = NULL;
+        *op_errno = ENOMEM;
         goto out;
     }
@@ -540,13 +605,11 @@ static void
 __insert_lock(pl_inode_t *pl_inode, posix_lock_t *lock)
 {
     if (lock->blocked)
-        gettimeofday(&lock->blkd_time, NULL);
+        lock->blkd_time = gf_time();
     else
-        gettimeofday(&lock->granted_time, NULL);
+        lock->granted_time = gf_time();
 
     list_add_tail(&lock->list, &pl_inode->ext_list);
-
-    return;
 }
 
 /* Return true if the locks overlap, false otherwise */
@@ -902,7 +965,7 @@ grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode)
     struct list_head granted_list;
     posix_lock_t *tmp = NULL;
     posix_lock_t *lock = NULL;
-
+    pl_local_t *local = NULL;
     INIT_LIST_HEAD(&granted_list);
 
     pthread_mutex_lock(&pl_inode->mutex);
@@ -917,9 +980,9 @@ grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode)
 
             pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW,
                          &lock->user_flock, 0, 0, NULL);
-
-            STACK_UNWIND_STRICT(lk, lock->frame, 0, 0, &lock->user_flock, NULL);
-
+            local = lock->frame->local;
+            PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0,
+                                     &lock->user_flock, NULL);
             __destroy_lock(lock);
         }
@@ -934,10 +997,12 @@ pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode,
         0,
     };
     posix_lock_t *unlock_lock = NULL;
+    int32_t op_errno = 0;
 
     struct list_head granted_list;
     posix_lock_t *tmp = NULL;
     posix_lock_t *lock = NULL;
+    pl_local_t *local = NULL;
 
     int ret = -1;
@@ -951,7 +1016,7 @@ pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode,
 
     unlock_lock = new_posix_lock(&flock, old_lock->client,
                                  old_lock->client_pid, &old_lock->owner,
                                  old_lock->fd,
-                                 old_lock->lk_flags, 0);
+                                 old_lock->lk_flags, 0, &op_errno);
     GF_VALIDATE_OR_GOTO(this->name, unlock_lock, out);
 
     ret = 0;
@@ -965,9 +1030,9 @@ pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode,
 
         pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW,
                      &lock->user_flock, 0, 0, NULL);
-
-        STACK_UNWIND_STRICT(lk, lock->frame, 0, 0, &lock->user_flock, NULL);
-
+        local = lock->frame->local;
+        PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0,
+                                 &lock->user_flock, NULL);
         __destroy_lock(lock);
     }
@@ -1002,7 +1067,7 @@ pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
 
     if (__is_lock_grantable(pl_inode, lock)) {
         if (pl_metalock_is_active(pl_inode)) {
-            __pl_queue_lock(pl_inode, lock, can_block);
+            __pl_queue_lock(pl_inode, lock);
             pthread_mutex_unlock(&pl_inode->mutex);
             ret = -2;
             goto out;
@@ -1015,7 +1080,7 @@ pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
         __insert_and_merge(pl_inode, lock);
     } else if (can_block) {
         if (pl_metalock_is_active(pl_inode)) {
-            __pl_queue_lock(pl_inode, lock, can_block);
+            __pl_queue_lock(pl_inode, lock);
             pthread_mutex_unlock(&pl_inode->mutex);
             ret = -2;
             goto out;
@@ -1026,6 +1091,10 @@ pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
                lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
                lock->client_pid, lkowner_utoa(&lock->owner),
                lock->user_flock.l_start, lock->user_flock.l_len);
+
+        pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW,
+                       &lock->user_flock, NULL);
+
         lock->blocked = 1;
         __insert_lock(pl_inode, lock);
         ret = -1;
@@ -1052,10 +1121,7 @@ out:
 posix_lock_t *
 pl_getlk(pl_inode_t *pl_inode, posix_lock_t *lock)
 {
-    posix_lock_t *conf = NULL;
-
-    conf = first_conflicting_overlap(pl_inode, lock);
-
+    posix_lock_t *conf = first_conflicting_overlap(pl_inode, lock);
     if (conf == NULL) {
         lock->fl_type = F_UNLCK;
         return lock;
@@ -1077,3 +1143,449 @@ pl_does_monkey_want_stuck_lock()
         return _gf_true;
     return _gf_false;
 }
+
+int
+pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock)
+{
+    posix_lock_t *lock = NULL;
+    posix_lock_t *i = NULL;
+    pl_rw_req_t *rw = NULL;
+    pl_rw_req_t *itr = NULL;
+    struct list_head unwind_blist = {
+        0,
+    };
+    struct list_head unwind_rw_list = {
+        0,
+    };
+    int ret = 0;
+
+    INIT_LIST_HEAD(&unwind_blist);
+    INIT_LIST_HEAD(&unwind_rw_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        /*
+        - go through the lock list
+        - remove all locks from different owners
+        - same owner locks will be added or substracted based on
+          the new request
+        - add the new lock
+        */
+        list_for_each_entry_safe(lock, i, &pl_inode->ext_list, list)
+        {
+            if (lock->blocked) {
+                list_del_init(&lock->list);
+                list_add(&lock->list, &unwind_blist);
+                continue;
+            }
+
+            if (locks_overlap(lock, reqlock)) {
+                if (same_owner(lock, reqlock))
+                    continue;
+
+                /* remove conflicting locks */
+                list_del_init(&lock->list);
+                __delete_lock(lock);
+                __destroy_lock(lock);
+            }
+        }
+
+        __insert_and_merge(pl_inode, reqlock);
+
+        list_for_each_entry_safe(rw, itr, &pl_inode->rw_list, list)
+        {
+            list_del_init(&rw->list);
+            list_add(&rw->list, &unwind_rw_list);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    /* unwind blocked locks */
+    list_for_each_entry_safe(lock, i, &unwind_blist, list)
+    {
+        PL_STACK_UNWIND_AND_FREE(((pl_local_t *)lock->frame->local), lk,
+                                 lock->frame, -1, EBUSY, &lock->user_flock,
+                                 NULL);
+        __destroy_lock(lock);
+    }
+
+    /* unwind blocked IOs */
+    list_for_each_entry_safe(rw, itr, &unwind_rw_list, list)
+    {
+        pl_clean_local(rw->stub->frame->local);
+        call_unwind_error(rw->stub, -1, EBUSY);
+    }
+
+    return ret;
+}
+
+/* Return true in case we need to ensure mandatory-locking
+ * semantics under different modes.
+ */
+gf_boolean_t
+pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode)
+{
+    posix_locks_private_t *priv = THIS->private;
+
+    if (priv->mandatory_mode == MLK_FILE_BASED && pl_inode->mandatory)
+        return _gf_true;
+    else if (priv->mandatory_mode == MLK_FORCED ||
+             priv->mandatory_mode == MLK_OPTIMAL)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+void
+pl_clean_local(pl_local_t *local)
+{
+    if (!local)
+        return;
+
+    if (local->inodelk_dom_count_req)
+        data_unref(local->inodelk_dom_count_req);
+    loc_wipe(&local->loc[0]);
+    loc_wipe(&local->loc[1]);
+    if (local->fd)
+        fd_unref(local->fd);
+    if (local->inode)
+        inode_unref(local->inode);
+    mem_put(local);
+}
+
+/*
+TODO: detach local initialization from PL_LOCAL_GET_REQUESTS and add it here
+*/
+int
+pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+{
+    pl_local_t *local = NULL;
+
+    if (!loc && !fd) {
+        return -1;
+    }
+
+    if (!frame->local) {
+        local = mem_get0(this->local_pool);
+        if (!local) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                   "mem allocation failed");
+            return -1;
+        }
+
+        local->inode = (loc ? inode_ref(loc->inode) : inode_ref(fd->inode));
+
+        frame->local = local;
+    }
+
+    return 0;
+}
+
+gf_boolean_t
+pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client)
+{
+    if (client && (client->opversion < GD_OP_VERSION_7_0)) {
+        return _gf_true;
+    }
+
+    if (is_lk_owner_null(owner)) {
+        return _gf_false;
+    }
+    return _gf_true;
+}
+
+static int32_t
+pl_inode_from_loc(loc_t *loc, inode_t **pinode)
+{
+    inode_t *inode = NULL;
+    int32_t error = 0;
+
+    if (loc->inode != NULL) {
+        inode = inode_ref(loc->inode);
+        goto done;
+    }
+
+    if (loc->parent == NULL) {
+        error = EINVAL;
+        goto done;
+    }
+
+    if (!gf_uuid_is_null(loc->gfid)) {
+        inode = inode_find(loc->parent->table, loc->gfid);
+        if (inode != NULL) {
+            goto done;
+        }
+    }
+
+    if (loc->name == NULL) {
+        error = EINVAL;
+        goto done;
+    }
+
+    inode = inode_grep(loc->parent->table, loc->parent, loc->name);
+    if (inode == NULL) {
+        /* We haven't found any inode. This means that the file doesn't exist
+         * or that even if it exists, we don't have any knowledge about it, so
+         * we don't have locks on it either, which is fine for our purposes. */
+        goto done;
+    }
+
+done:
+    *pinode = inode;
+
+    return error;
+}
+
+static gf_boolean_t
+pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode,
+                    struct timespec *now, struct list_head *contend)
+{
+    pl_dom_list_t *dom;
+    pl_inode_lock_t *lock;
+    gf_boolean_t has_owners = _gf_false;
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        list_for_each_entry(lock, &dom->inodelk_list, list)
+        {
+            /* If the lock belongs to the same client, we assume it's related
+             * to the same operation, so we allow the removal to continue. */
+            if (lock->client == client) {
+                continue;
+            }
+            /* If the lock belongs to an internal process, we don't block the
+             * removal. */
+            if (lock->client_pid < 0) {
+                continue;
+            }
+            if (contend == NULL) {
+                return _gf_true;
+            }
+            has_owners = _gf_true;
+            inodelk_contention_notify_check(xl, lock, now, contend);
+        }
+    }
+
+    return has_owners;
+}
+
+int32_t
+pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
+                        pl_inode_t **ppl_inode, struct list_head *contend)
+{
+    struct timespec now;
+    inode_t *inode;
+    pl_inode_t *pl_inode;
+    int32_t error;
+
+    pl_inode = NULL;
+
+    error = pl_inode_from_loc(loc, &inode);
+    if ((error != 0) || (inode == NULL)) {
+        goto done;
+    }
+
+    pl_inode = pl_inode_get(xl, inode, NULL);
+    if (pl_inode == NULL) {
+        inode_unref(inode);
+        error = ENOMEM;
+        goto done;
+    }
+
+    /* pl_inode_from_loc() already increments ref count for inode, so
+     * we only assign here our reference. */
+    pl_inode->inode = inode;
+
+    timespec_now(&now);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+
+    if (pl_inode->removed) {
+        error = ESTALE;
+        goto unlock;
+    }
+
+    if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
+        error = -1;
+        /* We skip the unlock here because the caller must create a stub when
+         * we return -1 and do a call to pl_inode_remove_complete(), which
+         * assumes the lock is still acquired and will release it once
+         * everything else is prepared. */
+        goto done;
+    }
+
+    pl_inode->is_locked = _gf_true;
+    pl_inode->remove_running++;
+
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+done:
+    *ppl_inode = pl_inode;
+
+    return error;
+}
+
+int32_t
+pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
+                         struct list_head *contend)
+{
+    pl_inode_lock_t *lock;
+    int32_t error = -1;
+
+    if (stub != NULL) {
+        list_add_tail(&stub->list, &pl_inode->waiting);
+        pl_inode->is_locked = _gf_true;
+    } else {
+        error = ENOMEM;
+
+        while (!list_empty(contend)) {
+            lock = list_first_entry(contend, pl_inode_lock_t, list);
+            list_del_init(&lock->list);
+            __pl_inodelk_unref(lock);
+        }
+    }
+
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (error < 0) {
+        inodelk_contention_notify(xl, contend);
+    }
+
+    inode_unref(pl_inode->inode);
+
+    return error;
+}
+
+void
+pl_inode_remove_wake(struct list_head *list)
+{
+    call_stub_t *stub;
+
+    while (!list_empty(list)) {
+        stub = list_first_entry(list, call_stub_t, list);
+        list_del_init(&stub->list);
+
+        call_resume(stub);
+    }
+}
+
+void
+pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
+{
+    struct list_head contend, granted;
+    struct timespec now;
+    pl_dom_list_t *dom;
+
+    if (pl_inode == NULL) {
+        return;
+    }
+
+    INIT_LIST_HEAD(&contend);
+    INIT_LIST_HEAD(&granted);
+    timespec_now(&now);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+
+    if (error == 0) {
+        if (pl_inode->links >= 0) {
+            pl_inode->links--;
+        }
+        if (pl_inode->links == 0) {
+            pl_inode->removed = _gf_true;
+        }
+    }
+
+    pl_inode->remove_running--;
+
+    if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
+        pl_inode->is_locked = _gf_false;
+
+        list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+        {
+            __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
+                                        &contend);
+        }
+    }
+
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    unwind_granted_inodes(xl, pl_inode, &granted);
+
+    inodelk_contention_notify(xl, &contend);
+
+    inode_unref(pl_inode->inode);
+}
+
+void
+pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
+                         struct list_head *list)
+{
+    call_stub_t *stub, *tmp;
+
+    if (!pl_inode->is_locked) {
+        return;
+    }
+
+    list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list)
+    {
+        if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL,
+                                 NULL)) {
+            list_move_tail(&stub->list, list);
+        }
+    }
+}
+
+/* This function determines if an inodelk attempt can be done now or it needs
+ * to wait.
+ *
+ * Possible return values:
+ *   < 0: An error occurred. Currently only -ESTALE can be returned if the
+ *        inode has been deleted previously by unlink/rmdir/rename
+ *   = 0: The lock can be attempted.
+ *   > 0: The lock needs to wait because a conflicting remove operation is
+ *        ongoing.
+ */
+int32_t
+pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
+{
+    pl_dom_list_t *dom;
+    pl_inode_lock_t *ilock;
+
+    /* If the inode has been deleted, we won't allow any lock. */
+    if (pl_inode->removed) {
+        return -ESTALE;
+    }
+
+    /* We only synchronize with locks made for regular operations coming from
+     * the user. Locks done for internal purposes are hard to control and
+     * could lead to long delays or deadlocks quite easily. */
+    if (lock->client_pid < 0) {
+        return 0;
+    }
+    if (!pl_inode->is_locked) {
+        return 0;
+    }
+    if (pl_inode->remove_running > 0) {
+        return 1;
+    }
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        list_for_each_entry(ilock, &dom->inodelk_list, list)
+        {
+            /* If a lock from the same client is already granted, we allow
+             * this one to continue. This is necessary to prevent deadlocks
+             * when multiple locks are taken for the same operation.
+             *
+             * On the other side it's unlikely that the same client sends
+             * completely unrelated locks for the same inode.
+             */
+            if (ilock->client == lock->client) {
+                return 0;
+            }
+        }
+    }
+
+    return 1;
+}
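
Two notes on using the code added by this diff.

First, the new pl_fetch_mlock_info_from_disk() path keys off the mere presence of the GF_ENFORCE_MANDATORY_LOCK xattr: any successful getxattr marks the inode for enforcement, ENODATA clears check_mlock_info for good, and any other error leaves it set so the lookup is retried. As a minimal client-side sketch of how a file could be tagged, assuming GF_ENFORCE_MANDATORY_LOCK resolves to "trusted.glusterfs.enforce-mandatory-lock" (verify against locks.h in your tree; trusted.* xattrs need root on a FUSE mount):

#include <stdio.h>
#include <sys/xattr.h>

int
main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file-on-gluster-mount>\n", argv[0]);
        return 1;
    }

    /* The value is irrelevant; the locks xlator only checks that the
     * key exists. */
    if (setxattr(argv[1], "trusted.glusterfs.enforce-mandatory-lock", "1", 1,
                 0) != 0) {
        perror("setxattr");
        return 1;
    }

    return 0;
}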
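Second, the unlink/rmdir synchronization added at the end (pl_inode_remove_prepare(), pl_inode_remove_complete(), pl_inode_remove_wake(), pl_inode_remove_cbk()) implies a three-phase contract for callers, which live in posix.c and are outside this diff. A sketch of the intended pattern, with the wrapper name and the wind step purely hypothetical:

/* Hypothetical entry-fop wrapper illustrating the contract of the remove
 * helpers above. `stub` must already wrap the continuation of the fop so
 * that pl_inode_remove_wake() can resume it later. */
static int32_t
pl_remove_example(xlator_t *xl, call_frame_t *frame, loc_t *loc,
                  call_stub_t *stub)
{
    struct list_head contend; /* lock-contention notifications to emit */
    pl_inode_t *pl_inode = NULL;
    int32_t error;

    INIT_LIST_HEAD(&contend);

    error = pl_inode_remove_prepare(xl, frame, loc, &pl_inode, &contend);
    if (error == 0) {
        /* Either the inode is unknown to the locks xlator (pl_inode is
         * NULL) or there are no conflicting inodelk owners and
         * remove_running was incremented: wind the real unlink/rmdir now.
         * Its cbk must call pl_inode_remove_cbk(xl, pl_inode, op_ret) to
         * drop remove_running and re-grant blocked inodelks. */
        /* STACK_WIND(...); */
        return 0;
    }

    if (error == -1) {
        /* Other clients hold inodelks. pl_inode->mutex is still held here;
         * pl_inode_remove_complete() queues the stub on pl_inode->waiting
         * (or fails with ENOMEM if stub is NULL), releases the mutex and
         * emits the contention notifications. */
        return pl_inode_remove_complete(xl, pl_inode, stub, &contend);
    }

    /* error > 0: EINVAL/ESTALE/ENOMEM; fail the fop with -error. */
    return -error;
}

pl_inode_remove_inodelk() is the other half of the handshake: while a remove is pending it returns 1 for new user inodelks, which must stay blocked until pl_inode_remove_cbk() re-grants them via __grant_blocked_inode_locks().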
