From f9698036fcc1ceedea19110139400d0cf4a54c9a Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Mon, 23 Dec 2013 09:32:22 +0000 Subject: cluster/afr: avoid race due to afr_is_transaction_running() Problem: ------------------------------------------ afr_lookup_perform_self_heal() { if(afr_is_transaction_running()) goto out else afr_launch_self_heal(); } ------------------------------------------ When 2 clients simultaneously access a file in split-brain, one of them acquires the inode lock and proceeds with afr_launch_self_heal (which eventually fails and sets "sh-failed" in the callback.) The second client meanwhile bails out of afr_lookup_perform_self_heal() because afr_is_transaction_running() returns true due to the lock obtained by client-1. Consequently in client-2, "sh-failed" does not get set in the dict, causing quick-read translator to *not* invalidate the inode, thereby serving data randomly from one of the bricks. Fix: If a possible split-brain is detected on lookup, forcefully traverse the afr_launch_self_heal() code path in afr_lookup_perform_self_heal(). 
Change-Id: I316f9f282543533fd3c958e4b63ecada42c2a14f BUG: 870565 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/6578 Reviewed-by: Pranith Kumar Karampuri Tested-by: Gluster Build System Reviewed-by: Varun Shastry --- xlators/cluster/afr/src/afr-common.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'xlators/cluster') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a4f97e950..250b0944e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1833,6 +1833,11 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { if (afr_is_transaction_running (local) && + /*Forcefully call afr_launch_self_heal (which will go on to + fail) for SB files.This prevents stale data being served + due to race in afr_is_transaction_running() when + multiple clients access the same SB file*/ + !local->cont.lookup.possible_spb && (!local->attempt_self_heal)) goto out; -- cgit