From 73835a06e87f685354816fb6c2ca4a9918f5e314 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K
Date: Wed, 31 Dec 2014 15:15:53 +0530
Subject: cluster/afr: serialize inode locks

Backport of http://review.gluster.org/9372

Problem:
AFR winds inodelk calls without any order, so blocking inodelks from two
different mounts can lead to a deadlock when mount1 gets the lock on
brick-1 and is blocked on brick-2, whereas mount2 gets the lock on
brick-2 and is blocked on brick-1.

Fix:
Serialize the inodelks, whether they are blocking or non-blocking.

Non-blocking locks also need to be serialized. Otherwise there is a
chance that both mounts which issued the same non-blocking inodelk end
up not acquiring the lock on any brick.
Ex: Mount1 and Mount2 request a full-length lock on file f1. Mount1's
AFR may acquire the partial lock on brick-1 and fail to acquire the
lock on brick-2 because Mount2 already got the lock on brick-2, and
vice versa. Since both mounts only got partial locks, AFR treats them
as failures in gaining the locks and unwinds with EAGAIN errno.

BUG: 1189023
Change-Id: If5dd502d9d25d12425749a8efcf08a1423b29255
Signed-off-by: Pranith Kumar K
Reviewed-on: http://review.gluster.org/9576
Reviewed-by: Krutika Dhananjay
Tested-by: Gluster Build System
Reviewed-by: Raghavendra Bhat
---
 xlators/cluster/afr/src/afr-common.c | 244 ++++++++++++++++++++++++++---------
 xlators/cluster/afr/src/afr.h        |   2 +-
 2 files changed, 186 insertions(+), 60 deletions(-)

diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index fde86a459a7..ce287aa28e9 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1037,6 +1037,10 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
                         dict_unref (local->cont.readdir.dict);
         }
 
+        { /* inodelk */
+                GF_FREE (local->cont.inodelk.volume);
+        }
+
         if (local->xdata_req)
                 dict_unref (local->xdata_req);
 
@@ -2753,8 +2757,9 @@ out:
 /* }}} */
 
 int32_t
-afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_unlock_partial_inodelk_cbk (call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xdata)
 {
         afr_local_t *local = NULL;
 
@@ -2804,7 +2809,7 @@ afr_unlock_inodelks_and_unwind (call_frame_t *frame, xlator_t *this,
                 if (local->replies[i].op_ret == -1)
                         continue;
 
-                STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
+                STACK_WIND_COOKIE (frame, afr_unlock_partial_inodelk_cbk,
                                    (void*) (long) i,
                                    priv->children[i],
                                    priv->children[i]->fops->inodelk,
@@ -2820,20 +2825,61 @@ afr_unlock_inodelks_and_unwind (call_frame_t *frame, xlator_t *this,
 }
 
 int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
-
+afr_inodelk_done (call_frame_t *frame, xlator_t *this)
 {
-        afr_local_t *local = NULL;
-        afr_private_t *priv = NULL;
-        int call_count = -1;
-        int child_index = (long)cookie;
         int i = 0;
         int lock_count = 0;
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+
         local = frame->local;
         priv = this->private;
 
+        for (i = 0; i < priv->child_count; i++) {
+                if (!local->replies[i].valid)
+                        continue;
+
+                if (local->replies[i].op_ret == 0)
+                        lock_count++;
+
+                if (local->op_ret == -1 && local->op_errno == EAGAIN)
+                        continue;
+
+                if ((local->replies[i].op_ret == -1) &&
+                    (local->replies[i].op_errno == EAGAIN)) {
+                        local->op_ret = -1;
+                        local->op_errno = EAGAIN;
+                        continue;
+                }
+
+                if (local->replies[i].op_ret == 0)
+                        local->op_ret = 0;
+
+                local->op_errno = local->replies[i].op_errno;
+        }
+
+        if (lock_count && local->cont.inodelk.flock.l_type != F_UNLCK &&
+            (local->op_ret == -1 && local->op_errno == EAGAIN)) {
+                afr_unlock_inodelks_and_unwind (frame, this,
+                                                lock_count);
+        } else {
+                AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
+                                  local->op_errno, local->xdata_rsp);
+        }
+
+        return 0;
+}
+
+int
+afr_common_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        afr_local_t *local = NULL;
+        int child_index = (long)cookie;
+
+        local = frame->local;
+
         local->replies[child_index].valid = 1;
         local->replies[child_index].op_ret = op_ret;
         local->replies[child_index].op_errno = op_errno;
@@ -2846,86 +2892,166 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
                 }
                 UNLOCK (&frame->lock);
         }
+        return 0;
+}
+
+static int32_t
+afr_parallel_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+        int call_count = 0;
+
+        afr_common_inodelk_cbk (frame, cookie, this, op_ret, op_errno, xdata);
 
         call_count = afr_frame_return (frame);
+        if (call_count == 0)
+                afr_inodelk_done (frame, this);
 
-        if (call_count == 0) {
-                for (i = 0; i < priv->child_count; i++) {
-                        if (!local->replies[i].valid)
-                                continue;
+        return 0;
+}
 
-                        if (local->replies[i].op_ret == 0)
-                                lock_count++;
+static inline gf_boolean_t
+afr_is_conflicting_lock_present (int32_t op_ret, int32_t op_errno)
+{
+        if (op_ret == -1 && op_errno == EAGAIN)
+                return _gf_true;
+
+        return _gf_false;
+}
 
-                        if (local->op_ret == -1 && local->op_errno == EAGAIN)
-                                continue;
+static int32_t
+afr_serialized_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *xdata)
 
-                        if ((local->replies[i].op_ret == -1) &&
-                            (local->replies[i].op_errno == EAGAIN)) {
-                                local->op_ret = -1;
-                                local->op_errno = EAGAIN;
-                                continue;
-                        }
+{
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+        int child_index = (long)cookie;
+        int next_child = 0;
 
-                        if (local->replies[i].op_ret == 0)
-                                local->op_ret = 0;
+        local = frame->local;
+        priv = this->private;
 
-                        local->op_errno = local->replies[i].op_errno;
-                }
+        afr_common_inodelk_cbk (frame, cookie, this, op_ret, op_errno, xdata);
 
-                if (lock_count && local->cont.inodelk.flock.l_type != F_UNLCK &&
-                    (local->op_ret == -1 && local->op_errno == EAGAIN)) {
-                        afr_unlock_inodelks_and_unwind (frame, this,
-                                                        lock_count);
-                } else {
-                        AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
-                                          local->op_errno, local->xdata_rsp);
-                }
+        for (next_child = child_index + 1; next_child < priv->child_count;
+             next_child++) {
+                if (local->child_up[next_child])
+                        break;
         }
 
+        if (afr_is_conflicting_lock_present (op_ret, op_errno) ||
+            (next_child == priv->child_count)) {
+                afr_inodelk_done (frame, this);
+        } else {
+                STACK_WIND_COOKIE (frame, afr_serialized_inodelk_cbk,
+                                   (void *) (long) next_child,
+                                   priv->children[next_child],
+                                   priv->children[next_child]->fops->inodelk,
+                                   (const char *)local->cont.inodelk.volume,
+                                   &local->loc, local->cont.inodelk.cmd,
+                                   &local->cont.inodelk.flock,
+                                   local->xdata_req);
+        }
+
+        return 0;
+}
+
+static int
+afr_parallel_inodelk_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv = NULL;
+        afr_local_t *local = NULL;
+        int call_count = 0;
+        int i = 0;
+
+        priv = this->private;
+        local = frame->local;
+        call_count = local->call_count;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!local->child_up[i])
+                        continue;
+                STACK_WIND_COOKIE (frame, afr_parallel_inodelk_cbk,
+                                   (void *) (long) i,
+                                   priv->children[i],
+                                   priv->children[i]->fops->inodelk,
+                                   (const char *)local->cont.inodelk.volume,
+                                   &local->loc, local->cont.inodelk.cmd,
+                                   &local->cont.inodelk.flock,
+                                   local->xdata_req);
+                if (!--call_count)
+                        break;
+        }
         return 0;
 }
 
+static int
+afr_serialized_inodelk_wind (call_frame_t *frame, xlator_t *this)
+{
+        afr_private_t *priv = NULL;
+        afr_local_t *local = NULL;
+        int i = 0;
+
+        priv = this->private;
+        local = frame->local;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND_COOKIE (frame, afr_serialized_inodelk_cbk,
+                                           (void *) (long) i,
+                                           priv->children[i],
+                                           priv->children[i]->fops->inodelk,
+                                           (const char *)local->cont.inodelk.volume,
+                                           &local->loc, local->cont.inodelk.cmd,
+                                           &local->cont.inodelk.flock,
+                                           local->xdata_req);
+                        break;
+                }
+        }
+        return 0;
+}
 
 int32_t
 afr_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
              loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
 {
-        afr_private_t *priv = NULL;
         afr_local_t *local = NULL;
-        int i = 0;
-        int32_t call_count = 0;
         int32_t op_errno = ENOMEM;
 
-        priv = this->private;
-
         local = AFR_FRAME_INIT (frame, op_errno);
         if (!local)
                 goto out;
 
         loc_copy (&local->loc, loc);
-        local->cont.inodelk.volume = volume;
+        local->cont.inodelk.volume = gf_strdup (volume);
+        if (!local->cont.inodelk.volume) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
         local->cont.inodelk.cmd = cmd;
         local->cont.inodelk.flock = *flock;
+        if (xdata)
+                local->xdata_req = dict_ref (xdata);
 
-        call_count = local->call_count;
-        if (!call_count) {
-                op_errno = ENOMEM;
-                goto out;
-        }
-
-        for (i = 0; i < priv->child_count; i++) {
-                if (local->child_up[i]) {
-                        STACK_WIND_COOKIE (frame, afr_inodelk_cbk,
-                                           (void*) (long) i,
-                                           priv->children[i],
-                                           priv->children[i]->fops->inodelk,
-                                           volume, loc, cmd, flock, xdata);
-
-                        if (!--call_count)
-                                break;
-                }
+        /* At least one child is up */
+        /*
+         * Non-blocking locks also need to be serialized. Otherwise there is
+         * a chance that both the mounts which issued same non-blocking inodelk
+         * may endup not acquiring the lock on any-brick.
+         * Ex: Mount1 and Mount2
+         * request for full length lock on file f1. Mount1 afr may acquire the
+         * partial lock on brick-1 and may not acquire the lock on brick-2
+         * because Mount2 already got the lock on brick-2, vice versa. Since
+         * both the mounts only got partial locks, afr treats them as failure in
+         * gaining the locks and unwinds with EAGAIN errno.
+         */
+        if (flock->l_type == F_UNLCK) {
+                afr_parallel_inodelk_wind (frame, this);
+        } else {
+                afr_serialized_inodelk_wind (frame, this);
         }
 
         return 0;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 9448471752b..f32ab9a9e71 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -624,7 +624,7 @@ typedef struct _afr_local {
         } zerofill;
 
         struct {
-                const char *volume;
+                char *volume;
                 int32_t cmd;
                 struct gf_flock flock;
         } inodelk;
--
cgit
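
The fix above is an instance of the classic lock-ordering rule: every client
takes the per-brick locks one at a time, in the same child order, instead of
winding to all bricks at once. As a standalone sketch of that idea (the names
are hypothetical and plain pthread mutexes stand in for per-brick inodelks;
this is not AFR code), the program below lets two "mounts" lock two bricks in
the same fixed order, so neither can hold one lock while waiting for the
other and the circular wait from the commit message cannot form.

/*
 * Illustrative sketch only, under the assumptions stated above.
 * Build with: cc -pthread lock-order-sketch.c  (hypothetical file name)
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t brick_lock[2] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Take the per-brick locks in index order, as the serialized wind does. */
static void
lock_bricks_in_order (const char *who)
{
        for (int i = 0; i < 2; i++)
                pthread_mutex_lock (&brick_lock[i]);
        printf ("%s holds both brick locks\n", who);
        for (int i = 1; i >= 0; i--)
                pthread_mutex_unlock (&brick_lock[i]);
}

/*
 * The pre-patch behaviour corresponds to the two clients effectively
 * starting from different bricks: A holds brick 0 and waits on brick 1
 * while B holds brick 1 and waits on brick 0, and neither returns.
 */
static void *
client (void *name)
{
        lock_bricks_in_order (name);
        return NULL;
}

int
main (void)
{
        pthread_t m1, m2;

        pthread_create (&m1, NULL, client, "mount1");
        pthread_create (&m2, NULL, client, "mount2");
        pthread_join (m1, NULL);
        pthread_join (m2, NULL);
        return 0;
}

The same ordering argument covers the non-blocking case handled by
afr_serialized_inodelk_wind(): a client only moves to the next brick after
the previous lock succeeded, so two clients can no longer each end up with a
partial set, and on a conflicting reply any partial locks already taken are
released through afr_unlock_inodelks_and_unwind() before unwinding with
EAGAIN.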