From 27ac070dc9612cfcd591464dbaa40ed52b84e23f Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Wed, 25 Jan 2017 15:31:44 +0530
Subject: cluster/ec: Don't trigger data/metadata heal on Lookups

Problem-1:
If a Lookup, which doesn't take any locks, observes a version mismatch, that
observation can't be trusted. If we launch a heal based on this information, it
will lead to self-heals which will affect I/O performance in the cases where
the Lookup is wrong. Considering that the self-heal-daemon and operations on
the inode from the client which take locks can still trigger heal, we can
choose not to attempt a heal on Lookup.

Problem-2:
Fixed spurious failure of
tests/bitrot/bug-1373520.t
For the issue above, what was happening was that ec_heal_inspect()
was preventing 'name' heal from happening.

Problem-3:
tests/basic/ec/ec-background-heals.t
To be honest, I don't know what the problem was. While fixing
the 2 problems above, I made some changes to ec_heal_inspect() and
ec_need_heal(), after which, when I tried to recreate the spurious
failure, it just didn't happen even after a long time.

 >BUG: 1414287
 >Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
 >Change-Id: Ife2535e1d0b267712973673f6d474e288f3c6834
 >Reviewed-on: https://review.gluster.org/16468
 >Smoke: Gluster Build System <jenkins@build.gluster.org>
 >NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
 >Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
 >CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
 >Reviewed-by: Ashish Pandey <aspandey@redhat.com>

BUG: 1419824
Change-Id: I340b48cd416b07890bf3a5427562f5e3f88a481f
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://review.gluster.org/16765
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Smoke: Gluster Build System <jenkins@build.gluster.org>
---
 tests/basic/ec/ec-background-heals.t |  2 -
 tests/basic/ec/heal-info.t           | 73 ++++++++++++++++++++++++++++++++++++
 tests/basic/ec/self-heal.t           | 21 +++++++----
 tests/bitrot/bug-1373520.t           | 35 ++---------------
 tests/volume.rc                      |  5 +++
 5 files changed, 95 insertions(+), 41 deletions(-)
 create mode 100644 tests/basic/ec/heal-info.t

(limited to 'tests')

diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t
index f0cabeab61c..eb434908bad 100644
--- a/tests/basic/ec/ec-background-heals.t
+++ b/tests/basic/ec/ec-background-heals.t
@@ -101,5 +101,3 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
 TEST chown root:root $M0/{a,b,c,d}
 EXPECT "0" mount_get_option_value $M0 $V0-disperse-0 heal-waiters
 cleanup
-#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696
-#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696
diff --git a/tests/basic/ec/heal-info.t b/tests/basic/ec/heal-info.t
new file mode 100644
index 00000000000..7393d22d222
--- /dev/null
+++ b/tests/basic/ec/heal-info.t
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks if heal info works as expected or not
+
+function create_files {
+        for i in {21..1000};
+        do
+                dd if=/dev/zero of=$M0/$i bs=1M count=1 2>&1 > /dev/null;
+        done
+        rm -f $M0/lock
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume set $V0 client-log-level DEBUG
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0;
+# Wait until all 6 children have been recognized by the ec xlator
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+# Heal info should report zero entries to be healed while I/O is in progress
+dd if=/dev/zero of=$M0/a bs=1M count=2048 &
+dd_pid=$!
+sleep 3 # Wait for I/O to proceed for some time
+EXPECT "^0$" get_pending_heal_count $V0
+kill -9 $dd_pid
+touch $M0/lock # create_files removes this file once it has written all its files
+create_files &
+
+total_heal_count=0 # running sum of the heal counts sampled while $M0/lock still exists
+while [ -f $M0/lock ];
+do
+        heal_count=$(get_pending_heal_count $V0)
+        total_heal_count=$((heal_count+total_heal_count))
+done
+EXPECT "^0$" echo $total_heal_count
+
+# When only a data heal is required, heal info should still report the entry
+# There is no easy way to create this state using commands, so assign the xattrs directly
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}0/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}1/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}2/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}3/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}4/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000010000000000000000 $B0/${V0}5/1000 # brick 5 gets a different value from bricks 0-4
+index_path=$B0/${V0}5/.glusterfs/indices/xattrop/$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}5/1000))
+while [ -f $index_path ]; do :; done # busy-wait until the index file no longer exists
+TEST touch $index_path # then recreate it by hand
+EXPECT "^1$" get_pending_heal_count $V0
+TEST rm -f $M0/1000
+
+# When files/directories need heal, test that heal info reports them
+TEST touch $M0/{1..10}
+TEST kill_brick $V0 $H0 $B0/${V0}0
+for i in {11..20};
+do
+        echo abc > $M0/$i #Data + entry + metadata heal
+done
+for i in {1..10};
+do
+        chmod +x $M0/$i;
+done
+
+EXPECT "^105$" get_pending_heal_count $V0 # NOTE(review): the derivation of 105 is not shown in this file - confirm
+
+cleanup
diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t
index 3e3467535fb..7f3486fe27b 100644
--- a/tests/basic/ec/self-heal.t
+++ b/tests/basic/ec/self-heal.t
@@ -9,6 +9,7 @@ cleanup
 
 function check_mount_dir
 {
+    getfattr -d -m. -e hex $M0 2>&1 > /dev/null
     for i in {1..20}; do
         ls -l $M0/ | grep "dir1"
         if [ $? -ne 0 ]; then
@@ -21,7 +22,7 @@ function check_mount_dir
 
 function check_size
 {
-    stat $M0/$1
+    cat $M0/$1 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         res=`stat -c "%s" $i/$1`
         if [ "$res" != "$2" ]; then
@@ -35,7 +36,7 @@ function check_size
 
 function check_mode
 {
-    stat $M0/$1
+    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         res=`stat -c "%A" $i/$1`
         if [ "$res" != "$2" ]; then
@@ -49,7 +50,7 @@ function check_mode
 
 function check_date
 {
-    stat $M0/$1
+    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         res=`stat -c "%Y" $i/$1`
         if [ "$res" != "$2" ]; then
@@ -63,7 +64,7 @@ function check_date
 
 function check_xattr
 {
-    stat $M0/$1
+    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         getfattr -n $2 $i/$1 2>/dev/null
         if [ $? -eq 0 ]; then
@@ -77,7 +78,7 @@ function check_xattr
 
 function check_dir
 {
-    getfattr -m. -d $M0/dir1
+    getfattr -m. -d $M0/dir1 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         if [ ! -d $i/dir1 ]; then
             echo "N"
@@ -90,7 +91,7 @@ function check_dir
 
 function check_soft_link
 {
-    stat $M0/test3
+    getfattr -d -m. -e hex $M0/test3 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         if [ ! -h $i/test3 ]; then
             echo "N"
@@ -103,7 +104,7 @@ function check_soft_link
 
 function check_hard_link
 {
-    stat $M0/test4
+    getfattr -d -m. -e hex $M0/test4 2>&1 > /dev/null
     for i in "${brick[@]}"; do
         res=`stat -c "%h" $i/test4`
         if [ "$res" != "3" ]; then
@@ -125,10 +126,14 @@ TESTS_EXPECTED_IN_LOOP=194
 TEST glusterd
 TEST pidof glusterd
 TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume set $V0 client-log-level DEBUG
+#Write-behind has a bug where lookup can race over write which leads to size mismatch on the mount after a 'cp'
+TEST $CLI volume set $V0 performance.write-behind off
 EXPECT "Created" volinfo_field $V0 'Status'
 TEST $CLI volume start $V0
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status'
-TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+#direct-io-mode is to make sure 'cat' leads to READ fop which triggers heal
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0;
 # Wait until all 6 childs have been recognized by the ec xlator
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
 
diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t
index 9362b9c516b..96e2891439f 100644
--- a/tests/bitrot/bug-1373520.t
+++ b/tests/bitrot/bug-1373520.t
@@ -49,37 +49,10 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
 #Delete file and all links from backend
 TEST rm -rf $(find $B0/${V0}5 -inum $(stat -c %i $B0/${V0}5/FILE1))
 
-# The test for each file below used to look like this:
-# 
-#   TEST stat $M0/FILE1
-#   EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" stat $B0/${V0}5/FILE1
-#
-# That didn't really work, because EXPECT_WITHIN would bail immediately if
-# 'stat' returned an error - which it would if the file wasn't there yet.
-# Since changing this, I usually see at least a few retries, and sometimes more
-# than twenty, before the check for HL_FILE1 succeeds.  The 'ls' is also
-# necessary, to force a name heal as well as data.  With both that and the
-# 'stat' on $M0 being done here for every retry, there's no longer any need to
-# have them elsewhere.
-#
-# If we had EW_RETRIES support (https://review.gluster.org/#/c/16451/) we could
-# use it here to see how many retries are typical on the machines we use for
-# regression, and set an appropriate upper bound.  As of right now, though,
-# that support does not exist yet.
-ugly_stat () {
-	local client_dir=$1
-	local brick_dir=$2
-	local bare_file=$3
-
-	ls $client_dir
-	stat -c %s $client_dir/$bare_file
-	stat -c %s $brick_dir/$bare_file 2> /dev/null || echo "UNKNOWN"
-}
-
 #Access files
-EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 FILE1
-EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 HL_FILE1
+TEST cat $M0/FILE1
+EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/FILE1
+TEST cat $M0/HL_FILE1
+EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/HL_FILE1
 
 cleanup;
-#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696
-#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696
diff --git a/tests/volume.rc b/tests/volume.rc
index e55a98253b8..9d85c035b15 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -519,6 +519,11 @@ function path_exists {
         if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi
 }
 
+function path_size { # Print the size in bytes of path $1, or an empty line if stat fails
+        local size # declared separately: 'local v=$(cmd)' would reset $? to 0 and hide a stat failure
+        if size=$(stat -c %s "$1"); then echo "$size"; else echo ""; fi
+}
+
 function force_umount {
         ${UMOUNT_F} $*
         if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi
-- 
cgit