From 1939519979b6c4da110f08434fe63d0b03332d52 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 25 Jan 2013 19:22:55 -0800 Subject: replicate: fix lock counting in blocking lock path As of http://review.gluster.org/2828, the blocking lock code path's condition for checking completion of locking atempt is broken. The condition - if ((child_index == priv->child_count) || ...) and if ((child_index == priv->child_count) && ...) which is retained to check completion of blocking lock attempts for DATA/METADATA transaction will _always_ fail because a few lines above we have - child_index = cookie % priv->child_count; So child_index will never equal priv->child_count. This leaves the correctness at the mercy of the next part of the conditional - .. (int_lock->lock_count == int_lock->lk_expected_count) .. This "works" as long as no server went down during the transaction. If a server goes down in the middle of the transaction, then this condition also fails, and the code wraps around and starts a blocking lock attempt loop all the way again from from the first server. This results in double locks getting acquired on those servers, and eventually the second condition gets hit (first condition is _never_ hit) and we come out of locking phase. During unlock phase we perform only one unlock per server leaving the other lock "leaked" forever. Change-Id: I7189cdf3f70901b04647516fe1d1e189f36cc8dd BUG: 765564 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.org/4433 Reviewed-by: Krishnan Parthasarathi Reviewed-by: Pranith Kumar Karampuri Tested-by: Gluster Build System --- xlators/cluster/afr/src/afr-lk-common.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'xlators/cluster/afr/src/afr-lk-common.c') diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index ef183d985..802f91f5f 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -913,6 +913,8 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); @@ -1072,7 +1074,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) child_index++; } - if ((child_index == priv->child_count) && + if ((int_lock->lk_expected_count == int_lock->lk_attempted_count) && ((afr_is_entrylk (int_lock, local->transaction.type) && int_lock->entrylk_lock_count == 0) || (int_lock->lock_count == 0))){ @@ -1091,10 +1093,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) } - if ((child_index == priv->child_count) || - (int_lock->entrylk_lock_count == int_lock->lk_expected_count) || - (int_lock->lock_count == int_lock->lk_expected_count)) { - + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, -- cgit