diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2018-10-10 12:18:55 +0530 | 
|---|---|---|
| committer | Amar Tumballi <amarts@redhat.com> | 2018-10-10 16:18:55 +0000 | 
| commit | 196b32423af9a5088056fe10ea22d01494670931 (patch) | |
| tree | 4760872562d1e14d721dd6627378a204867f1be7 | |
| parent | 4da0d93f8703c7247fece9d9fb471742e6fd7c33 (diff) | |
afr: prevent winding inodelks twice for arbiter volumes
Problem:
In an arbiter volume, if there is a pending data heal of a file only on
arbiter brick, self-heal takes inodelks twice due to a code-bug but unlocks
it only once, leaving behind a stale lock on the brick. This causes
the next write to the file to hang.
Fix:
Fix the code-bug to take lock only once. This bug was introduced master
with commit eb472d82a083883335bc494b87ea175ac43471ff
Thanks to  Pranith Kumar K <pkarampu@redhat.com> for finding the RCA.
fixes: bz#1637802
Change-Id: I15ad969e10a6a3c4bd255e2948b6be6dcddc61e1
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
| -rw-r--r-- | tests/bugs/replicate/bug-1637802-arbiter-stale-data-heal-lock.t | 44 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 2 | 
2 files changed, 45 insertions, 1 deletions
| diff --git a/tests/bugs/replicate/bug-1637802-arbiter-stale-data-heal-lock.t b/tests/bugs/replicate/bug-1637802-arbiter-stale-data-heal-lock.t new file mode 100644 index 00000000000..91ed39beb96 --- /dev/null +++ b/tests/bugs/replicate/bug-1637802-arbiter-stale-data-heal-lock.t @@ -0,0 +1,44 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../afr.rc + +cleanup; + +# Test to check that data self-heal does not leave any stale lock. + +TEST glusterd; +TEST pidof glusterd; +TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0,1,2}; +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +# Create base entry in indices/xattrop +echo "Data" > $M0/FILE + +# Kill arbiter brick and write to FILE. +TEST kill_brick $V0 $H0 $B0/${V0}2 +echo "arbiter down" >> $M0/FILE +EXPECT 2 get_pending_heal_count $V0 + +# Bring it back up and let heal complete. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +# write to the FILE must succeed. +echo "this must succeed" >> $M0/FILE +TEST [ $? -eq 0 ] +cleanup; diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a3650a3b22d..8cd79c73aa4 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -731,7 +731,7 @@ restore_time:      afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,                                locked_replies); -    if (!is_arbiter_the_only_sink || !empty_file) { +    if (!is_arbiter_the_only_sink && !empty_file) {          ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,                                     data_lock);          if (ret < priv->child_count) { | 
