From 27ac070dc9612cfcd591464dbaa40ed52b84e23f Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Wed, 25 Jan 2017 15:31:44 +0530 Subject: cluster/ec: Don't trigger data/metadata heal on Lookups Problem-1 If Lookup which doesn't take any locks observes version mismatch it can't be trusted. If we launch a heal based on this information it will lead to self-heals which will affect I/O performance in the cases where Lookup is wrong. Considering self-heal-daemon and operations on the inode from client which take locks can still trigger heal we can choose to not attempt a heal on Lookup. Problem-2: Fixed spurious failure of tests/bitrot/bug-1373520.t For the issues above, what was happening was that ec_heal_inspect() is preventing 'name' heal to happen Problem-3: tests/basic/ec/ec-background-heals.t To be honest I don't know what the problem was, while fixing the 2 problems above, I made some changes to ec_heal_inspect() and ec_need_heal() after which when I tried to recreate the spurious failure it just didn't happen even after a long time. >BUG: 1414287 >Signed-off-by: Pranith Kumar K >Change-Id: Ife2535e1d0b267712973673f6d474e288f3c6834 >Reviewed-on: https://review.gluster.org/16468 >Smoke: Gluster Build System >NetBSD-regression: NetBSD Build System >Reviewed-by: Xavier Hernandez >CentOS-regression: Gluster Build System >Reviewed-by: Ashish Pandey BUG: 1419824 Change-Id: I340b48cd416b07890bf3a5427562f5e3f88a481f Signed-off-by: Pranith Kumar K Reviewed-on: https://review.gluster.org/16765 NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Xavier Hernandez Smoke: Gluster Build System --- tests/basic/ec/ec-background-heals.t | 2 - tests/basic/ec/heal-info.t | 73 ++++++++++++++++++++++++++++++++++++ tests/basic/ec/self-heal.t | 21 +++++++---- tests/bitrot/bug-1373520.t | 35 ++--------------- tests/volume.rc | 5 +++ 5 files changed, 95 insertions(+), 41 deletions(-) create mode 100644 tests/basic/ec/heal-info.t (limited to 'tests') diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t index f0cabeab61c..eb434908bad 100644 --- a/tests/basic/ec/ec-background-heals.t +++ b/tests/basic/ec/ec-background-heals.t @@ -101,5 +101,3 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 TEST chown root:root $M0/{a,b,c,d} EXPECT "0" mount_get_option_value $M0 $V0-disperse-0 heal-waiters cleanup -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696 diff --git a/tests/basic/ec/heal-info.t b/tests/basic/ec/heal-info.t new file mode 100644 index 00000000000..7393d22d222 --- /dev/null +++ b/tests/basic/ec/heal-info.t @@ -0,0 +1,73 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks if heal info works as expected or not + +function create_files { + for i in {21..1000}; + do + dd if=/dev/zero of=$M0/$i bs=1M count=1 2>&1 > /dev/null; + done + rm -f $M0/lock +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +TEST $CLI volume heal $V0 disable +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0; +# Wait until all 6 childs have been recognized by the ec xlator +EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + +#heal info should give zero entries to be healed when I/O is going on +dd if=/dev/zero of=$M0/a bs=1M count=2048 & +dd_pid=$! +sleep 3 #Wait for I/O to proceed for some time +EXPECT "^0$" get_pending_heal_count $V0 +kill -9 $dd_pid +touch $M0/lock +create_files & + +total_heal_count=0 +while [ -f $M0/lock ]; +do + heal_count=$(get_pending_heal_count $V0) + total_heal_count=$((heal_count+total_heal_count)) +done +EXPECT "^0$" echo $total_heal_count + +#When only data heal is required it should print it +#There is no easy way to create this using commands so assigning xattrs directly +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}0/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}1/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}2/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}3/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}4/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000010000000000000000 $B0/${V0}5/1000 +index_path=$B0/${V0}5/.glusterfs/indices/xattrop/$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}5/1000)) +while [ -f $index_path ]; do :; done +TEST touch $index_path +EXPECT "^1$" get_pending_heal_count $V0 +TEST rm -f $M0/1000 + +#When files/directories need heal test that it prints them +TEST touch $M0/{1..10} +TEST kill_brick $V0 $H0 $B0/${V0}0 +for i in {11..20}; +do + echo abc > $M0/$i #Data + entry + metadata heal +done +for i in {1..10}; +do + chmod +x $M0/$i; +done + +EXPECT "^105$" get_pending_heal_count $V0 + +cleanup diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 3e3467535fb..7f3486fe27b 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -9,6 +9,7 @@ cleanup function check_mount_dir { + getfattr -d -m. -e hex $M0 2>&1 > /dev/null for i in {1..20}; do ls -l $M0/ | grep "dir1" if [ $? -ne 0 ]; then @@ -21,7 +22,7 @@ function check_mount_dir function check_size { - stat $M0/$1 + cat $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%s" $i/$1` if [ "$res" != "$2" ]; then @@ -35,7 +36,7 @@ function check_size function check_mode { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%A" $i/$1` if [ "$res" != "$2" ]; then @@ -49,7 +50,7 @@ function check_mode function check_date { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%Y" $i/$1` if [ "$res" != "$2" ]; then @@ -63,7 +64,7 @@ function check_date function check_xattr { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do getfattr -n $2 $i/$1 2>/dev/null if [ $? -eq 0 ]; then @@ -77,7 +78,7 @@ function check_xattr function check_dir { - getfattr -m. -d $M0/dir1 + getfattr -m. -d $M0/dir1 2>&1 > /dev/null for i in "${brick[@]}"; do if [ ! -d $i/dir1 ]; then echo "N" @@ -90,7 +91,7 @@ function check_dir function check_soft_link { - stat $M0/test3 + getfattr -d -m. -e hex $M0/test3 2>&1 > /dev/null for i in "${brick[@]}"; do if [ ! -h $i/test3 ]; then echo "N" @@ -103,7 +104,7 @@ function check_soft_link function check_hard_link { - stat $M0/test4 + getfattr -d -m. -e hex $M0/test4 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%h" $i/test4` if [ "$res" != "3" ]; then @@ -125,10 +126,14 @@ TESTS_EXPECTED_IN_LOOP=194 TEST glusterd TEST pidof glusterd TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +#Write-behind has a bug where lookup can race over write which leads to size mismatch on the mount after a 'cp' +TEST $CLI volume set $V0 performance.write-behind off EXPECT "Created" volinfo_field $V0 'Status' TEST $CLI volume start $V0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' -TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +#direct-io-mode is to make sure 'cat' leads to READ fop which triggers heal +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0; # Wait until all 6 childs have been recognized by the ec xlator EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t index 9362b9c516b..96e2891439f 100644 --- a/tests/bitrot/bug-1373520.t +++ b/tests/bitrot/bug-1373520.t @@ -49,37 +49,10 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count #Delete file and all links from backend TEST rm -rf $(find $B0/${V0}5 -inum $(stat -c %i $B0/${V0}5/FILE1)) -# The test for each file below used to look like this: -# -# TEST stat $M0/FILE1 -# EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" stat $B0/${V0}5/FILE1 -# -# That didn't really work, because EXPECT_WITHIN would bail immediately if -# 'stat' returned an error - which it would if the file wasn't there yet. -# Since changing this, I usually see at least a few retries, and sometimes more -# than twenty, before the check for HL_FILE1 succeeds. The 'ls' is also -# necessary, to force a name heal as well as data. With both that and the -# 'stat' on $M0 being done here for every retry, there's no longer any need to -# have them elsewhere. -# -# If we had EW_RETRIES support (https://review.gluster.org/#/c/16451/) we could -# use it here to see how many retries are typical on the machines we use for -# regression, and set an appropriate upper bound. As of right now, though, -# that support does not exist yet. -ugly_stat () { - local client_dir=$1 - local brick_dir=$2 - local bare_file=$3 - - ls $client_dir - stat -c %s $client_dir/$bare_file - stat -c %s $brick_dir/$bare_file 2> /dev/null || echo "UNKNOWN" -} - #Access files -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 FILE1 -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 HL_FILE1 +TEST cat $M0/FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/FILE1 +TEST cat $M0/HL_FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/HL_FILE1 cleanup; -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696 diff --git a/tests/volume.rc b/tests/volume.rc index e55a98253b8..9d85c035b15 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -519,6 +519,11 @@ function path_exists { if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi } +function path_size { + local size=$(stat -c %s $1) + if [ $? -eq 0 ]; then echo $size; else echo ""; fi +} + function force_umount { ${UMOUNT_F} $* if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi -- cgit