From 56caf4349c8824fde70783fe404cc6f646dce149 Mon Sep 17 00:00:00 2001 From: Xavier Hernandez Date: Wed, 8 Oct 2014 09:20:11 +0200 Subject: ec: Fix self-heal issues Problem: Doing an 'ls' of a directory that has been modified while one of the bricks was down, sometimes returns the old directory contents. Cause: Directories are not marked when they are modified as files are. The ec xlator balances requests amongst available and healthy bricks. Since there is no way to detect that a directory is out of date in one of the bricks, it is used from time to time to return the directory contents. Solution: Basically the solution consists in using versioning information for directories as well; however, some additional changes have been necessary. Changes: * Use directory versioning: This required locking the full directory instead of a single entry for all requests that add or remove entries from it. This is needed to allow atomic version update. This affects the following fops: create, mkdir, mknod, link, symlink, rename, unlink, rmdir Another side effect is that opendir must first do a lookup to get versioning information and discard out of date bricks for subsequent readdir(p) calls. * Restrict directory self-heal: Till now, when one discrepancy was found in lookup, a self-heal was automatically started. This caused the versioning information of a bad directory to be healed instantly, making the original problem reappear. To solve this, when a missing directory is detected in one or more bricks on lookup or opendir fops, only a partial self-heal is performed on it. A partial self-heal basically creates the directory but does not restore any additional information. This prevents an 'ls' from repairing the directory and causing the problem to happen again. With this change, output of 'ls' is always consistent. 
However, since the directory has been created in the brick, this allows any other operation on it (create new files, for example) to succeed on all bricks and not add additional work to the self-heal process. To force a self-heal of a directory, any other operation must be done on it, for example a getxattr. With these changes, the correct healing procedure that would avoid inconsistent directory browsing consists of a post-order traversal of directories being healed. This way, the directory contents will be healed before healing the directory itself. * Additional changes to fix self-heal errors - Don't use fop->fd to decide between fd/loc. open, opendir and create have an fd, but the correct data is in loc. - Fix incorrect management of bad bricks per inode/fd. - Fix incorrect selection of fop's target bricks when there are bad bricks involved. - Improved ec_loc_parent() to always return a parent loc as complete as possible. Change-Id: Iaf3df174d7857da57d4a87b4a8740a7048b366ad BUG: 1149726 Signed-off-by: Xavier Hernandez Reviewed-on: http://review.gluster.org/8916 Reviewed-by: Dan Lambright Tested-by: Gluster Build System --- tests/basic/ec/ec.t | 59 ++++++++++++------ tests/basic/ec/self-heal.t | 151 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 166 insertions(+), 44 deletions(-) (limited to 'tests/basic/ec') diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t index c12aba3afe6..7b8a8568129 100644 --- a/tests/basic/ec/ec.t +++ b/tests/basic/ec/ec.t @@ -45,66 +45,97 @@ function check_truncate { } function check_hard_link { + stat $M0/hard-link-1 + stat $M0/hard-link-2 for b in $*; do inum1=$(ls -i $b/hard-link-1 | cut -d' ' -f1) inum2=$(ls -i $b/hard-link-2 | cut -d' ' -f1) - [ "$inum1" = "$inum2" ] || return 1 + if [ "$inum1" != "$inum2" ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_soft_link { + stat $M0/soft-link for b in $*; do - [ "$(readlink 
$b/soft-link)" != "soft-link-tgt" ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_unlink { + stat $M0/unlink for b in $*; do - [ ! -e $b/unlink ] || return 1 + if [ -e $b/unlink ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_mkdir { + getfattr -m. -d $M0/mkdir for b in $*; do - [ -d $b/mkdir ] || return 1 + if [ ! -d $b/mkdir ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_rmdir { + getfattr -m. -d $M0/rmdir for b in $*; do - [ ! -e $b/rmdir ] || return 1 + if [ -e $b/rmdir ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_setxattr { + stat $M0/setxattr for b in $*; do v=$(my_getfattr -n user.foo $b/setxattr) - [ "$v" = "ash_nazg_durbatuluk" ] || return 1 + if [ "$v" != "ash_nazg_durbatuluk" ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_removexattr { + stat $M0/removexattr for b in $*; do my_getfattr -n user.bar $b/removexattr 2> /dev/null - [ $? = 0 ] && return 1 + if [ $? -eq 0 ]; then + echo "N" + return 0 + fi done echo "Y" return 0 } function check_perm_file { + stat $M0/perm_dir/perm_file + getfattr -m. -d $M0/perm_dir b1=$1 shift 1 ftext=$(stat -c "%u %g %a" $b1/perm_dir/perm_file) @@ -113,7 +144,8 @@ function check_perm_file { btext=$(stat -c "%u %g %a" $b/perm_dir/perm_file) #echo " next u/a/a = $btext" > /dev/tty if [ x"$btext" != x"$ftext" ]; then - return 1 + echo "N" + return 0 fi done echo "Y" @@ -210,17 +242,6 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "10" ec_child_up_count $V0 0 TEST check_create_write $M0 TEST check_truncate $M0 -TEST stat $M0/hard-link-1 -TEST stat $M0/hard-link-2 -TEST stat $M0/soft-link -TEST ! stat $M0/unlink -TEST ! 
stat $M0/rmdir -TEST stat $M0/mkdir -TEST stat $M0/setxattr -TEST stat $M0/removexattr -TEST stat $M0/perm_dir -TEST stat $M0/perm_dir/perm_file - EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_hard_link $B0/${V0}{0..9} EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_soft_link $B0/${V0}{0..9} EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_unlink $B0/${V0}{0..9} diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index d8a2f7988a0..ae7e4fe495a 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -7,12 +7,119 @@ cleanup +function check_mount_dir +{ + for i in {1..20}; do + ls | grep "dir1" + if [ $? -ne 0 ]; then + return 1 + fi + done + + return 0 +} + +function check_size +{ + stat $1 + for i in "${brick[@]}"; do + res=`stat -c "%s" $i/$1` + if [ "$res" != "$2" ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_mode +{ + stat $1 + for i in "${brick[@]}"; do + res=`stat -c "%A" $i/$1` + if [ "$res" != "$2" ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_date +{ + stat $1 + for i in "${brick[@]}"; do + res=`stat -c "%Y" $i/$1` + if [ "$res" != "$2" ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_xattr +{ + stat $1 + for i in "${brick[@]}"; do + getfattr -n $2 $i/$1 2>/dev/null + if [ $? -eq 0 ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_dir +{ + getfattr -m. -d dir1 + for i in "${brick[@]}"; do + if [ ! -d $i/dir1 ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_soft_link +{ + stat test3 + for i in "${brick[@]}"; do + if [ ! -h $i/test3 ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + +function check_hard_link +{ + stat test4 + for i in "${brick[@]}"; do + if [ `stat -c "%h" $i/test4` -ne 3 ]; then + echo "N" + return 0 + fi + done + echo "Y" + return 0 +} + tmp=`mktemp -d -t ${0##*/}.XXXXXX` if [ ! 
-d $tmp ]; then exit 1 fi -TESTS_EXPECTED_IN_LOOP=250 +TESTS_EXPECTED_IN_LOOP=194 TEST glusterd TEST pidof glusterd @@ -21,6 +128,7 @@ EXPECT "Created" volinfo_field $V0 'Status' TEST $CLI volume start $V0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +# Wait until all 6 childs have been recognized by the ec xlator EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024 @@ -46,12 +154,11 @@ for idx1 in {0..5}; do TEST chmod 666 ${brick[$idx1]}/test TEST truncate -s 0 ${brick[$idx1]}/test TEST setfattr -n user.test -v "test1" ${brick[$idx1]}/test - sleep 1 EXPECT "-rw-r--r--" stat -c "%A" test - EXPECT_WITHIN $HEAL_TIMEOUT "262144" stat -c "%s" ${brick[$idx1]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "946681200" stat -c "%Y" ${brick[$idx1]}/test - TEST ! getfattr -n user.test ${brick[$idx1]}/test + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_size test "262144" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_mode test "-rw-r--r--" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_date test "946681200" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_xattr test "user.test" done for idx1 in {0..4}; do @@ -63,16 +170,11 @@ for idx1 in {0..4}; do TEST truncate -s 2097152 ${brick[$idx2]}/test TEST setfattr -n user.test -v "test1" ${brick[$idx1]}/test TEST setfattr -n user.test -v "test2" ${brick[$idx2]}/test - sleep 1 EXPECT "-rw-r--r--" stat -c "%A" test - EXPECT_WITHIN $HEAL_TIMEOUT "262144" stat -c "%s" ${brick[$idx1]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "262144" stat -c "%s" ${brick[$idx2]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "-rw-r--r--" stat -c "%A" ${brick[$idx2]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "946681200" stat -c "%Y" ${brick[$idx1]}/test - EXPECT_WITHIN $HEAL_TIMEOUT "946681200" stat -c "%Y" ${brick[$idx2]}/test 
- TEST ! getfattr -n user.test ${brick[$idx1]}/test - TEST ! getfattr -n user.test ${brick[$idx2]}/test + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_size test "262144" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_mode test "-rw-r--r--" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_date test "946681200" + EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_xattr test "user.test" fi done done @@ -96,26 +198,25 @@ EXPECT "2" stat -c "%h" test2 EXPECT "2" stat -c "%h" test4 TEST $CLI volume start $V0 force +# Wait until the killed bricks have been started and recognized by the ec +# xlator EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 +TEST check_mount_dir + EXPECT "1048576" stat -c "%s" test2 EXPECT "-rwxrwxrwx" stat -c "%A" test2 -EXPECT_WITHIN $HEAL_TIMEOUT "262144" stat -c "%s" ${brick[0]}/test2 -EXPECT_WITHIN $HEAL_TIMEOUT "262144" stat -c "%s" ${brick[1]}/test2 -EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[0]}/test2 -EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[1]}/test2 +EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_size test2 "262144" +EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_mode test2 "-rwxrwxrwx" TEST ls -al dir1 -EXPECT_WITHIN $HEAL_TIMEOUT "1" eval "if [ -d ${brick[0]}/dir1 ]; then echo 1; fi" -EXPECT_WITHIN $HEAL_TIMEOUT "1" eval "if [ -d ${brick[1]}/dir1 ]; then echo 1; fi" +EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_dir TEST [ -h test3 ] -EXPECT_WITHIN $HEAL_TIMEOUT "1" eval "if [ -h ${brick[0]}/test3 ]; then echo 1; fi" -EXPECT_WITHIN $HEAL_TIMEOUT "1" eval "if [ -h ${brick[1]}/test3 ]; then echo 1; fi" +EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_soft_link EXPECT "2" stat -c "%h" test4 -EXPECT_WITHIN $HEAL_TIMEOUT "3" stat -c "%h" ${brick[0]}/test4 -EXPECT_WITHIN $HEAL_TIMEOUT "3" stat -c "%h" ${brick[1]}/test4 +EXPECT_WITHIN $HEAL_TIMEOUT "Y" check_hard_link TEST rm -rf $tmp -- cgit