From d2650feb4bfadf3fb0cdb90236bc78c33b5ea451 Mon Sep 17 00:00:00 2001
From: Sunil Kumar Acharya
Date: Wed, 5 Jul 2017 16:41:38 +0530
Subject: cluster/ec: Non-disruptive upgrade on EC volume fails

Problem: With optimistic changelog enabled on an EC volume, node-down
scenarios were not handled appropriately, resulting in volume data
inaccessibility.

Solution: Update the dirty xattr appropriately on the good bricks
whenever nodes are down. This fixes the metadata as part of heal and
thus ensures data accessibility.

BUG: 1468261
Change-Id: I08b0d28df386d9b2b49c3de84b4aac1c729ac057
Signed-off-by: Sunil Kumar Acharya
Reviewed-on: https://review.gluster.org/17703
Smoke: Gluster Build System
CentOS-regression: Gluster Build System
Reviewed-by: Pranith Kumar Karampuri
---
 tests/basic/ec/ec-1468261.t          | 96 ++++++++++++++++++++++++++++++++++++
 tests/basic/ec/ec-background-heals.t |  1 +
 2 files changed, 97 insertions(+)
 create mode 100644 tests/basic/ec/ec-1468261.t
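The scenario this patch addresses can also be reproduced by hand before
reading the test. The sketch below is illustrative only and is not part of
the patch: the volume name, brick paths, and mount point are assumptions,
and it relies only on the standard gluster CLI, getfattr(1), and pkill(1).

#!/bin/bash
# Manual sketch of the node-down scenario the test below automates.
# Assumed (not from the patch): volume "ec6" created as 4+2 disperse,
# bricks at /bricks/ec6-{0..5} on this host, FUSE mount at /mnt/ec6.
VOL=ec6
MNT=/mnt/ec6

gluster volume set $VOL disperse.optimistic-change-log on

# With every brick up, a successful create should leave no pending
# dirty marker on the directory on any brick:
mkdir $MNT/dir && touch $MNT/dir/file
for b in /bricks/ec6-{0..5}; do
    getfattr -n trusted.ec.dirty -e hex $b/dir 2>/dev/null
done

# Kill one brick daemon (matched by its assumed brick path) and create
# another file; the surviving (good) bricks should now report a non-zero
# trusted.ec.dirty, which is what tells self-heal what to repair once the
# downed brick returns:
pkill -f "glusterfsd.*ec6-0"
touch $MNT/dir/file2
getfattr -n trusted.ec.dirty -e hex /bricks/ec6-1/dir

Without the fix, that dirty update on the good bricks could be missed, so
the subsequent heal had nothing to act on and data could become
inaccessible, as described in the commit message.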
diff --git a/tests/basic/ec/ec-1468261.t b/tests/basic/ec/ec-1468261.t
new file mode 100644
index 00000000000..9c4f981ab47
--- /dev/null
+++ b/tests/basic/ec/ec-1468261.t
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# This test case verifies handling of the node-down scenario with
+# optimistic changelog enabled on an EC volume.
+###
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+#Create and start volume
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
+TEST $CLI volume start $V0
+
+#Mount the volume
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+#Verify that all is good: no dirty flag should be left behind
+TEST mkdir $M0/test_dir
+TEST touch $M0/test_dir/file
+sleep 2
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}0/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}1/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}2/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}3/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}4/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}5/test_dir
+
+#Touch a file and kill two bricks
+pid0=`get_brick_pid $V0 $H0 $B0/${V0}0`
+pid1=`get_brick_pid $V0 $H0 $B0/${V0}1`
+TEST touch $M0/test_dir/new_file
+kill $pid0
+kill $pid1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
+
+#Dirty should be set only on the up bricks
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}0/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^$" get_hex_xattr trusted.ec.dirty $B0/${V0}1/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^00000000000000010000000000000001$" get_hex_xattr trusted.ec.dirty $B0/${V0}2/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^00000000000000010000000000000001$" get_hex_xattr trusted.ec.dirty $B0/${V0}3/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^00000000000000010000000000000001$" get_hex_xattr trusted.ec.dirty $B0/${V0}4/test_dir
+EXPECT_WITHIN $IO_WAIT_TIMEOUT "^00000000000000010000000000000001$" get_hex_xattr trusted.ec.dirty $B0/${V0}5/test_dir
+
+#Bring up the down bricks
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+#Remove mount point contents
+TEST rm -rf $M0"/*" 2>/dev/null
+
+#Unmount and remount the volume
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST glusterfs -s $H0 --volfile-id $V0 $M0;
+
+#Create a tar file
+TEST mkdir $M0/test_dir
+for i in {1..3000}; do
+    dd if=/dev/urandom of=$M0/test_dir/file-$i bs=1k count=10
+done
+tar -cf $M0/test_dir.tar $M0/test_dir/ 2>/dev/null
+rm -rf $M0/test_dir/
+
+#Untar the tar file in the background
+tar -C $M0 -xf $M0/test_dir.tar 2>/dev/null &
+
+#Kill bricks 0 and 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
+
+#Stop untarring
+TEST kill %1
+
+#Bring up the down bricks
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+#Wait for heal to complete
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+#Kill bricks 3 and 4
+TEST kill_brick $V0 $H0 $B0/${V0}3
+TEST kill_brick $V0 $H0 $B0/${V0}4
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
+
+#Remove mount point contents
+#This will fail if things are wrong
+TEST rm -rf $M0"/*" 2>/dev/null
+
+cleanup
diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t
index eb434908bad..b9291bc9c32 100644
--- a/tests/basic/ec/ec-background-heals.t
+++ b/tests/basic/ec/ec-background-heals.t
@@ -16,6 +16,7 @@
 TEST $CLI volume set $V0 performance.quick-read off
 TEST $CLI volume set $V0 performance.read-ahead off
 TEST $CLI volume set $V0 performance.io-cache off
 TEST $CLI volume set $V0 disperse.background-heals 0
+TEST $CLI volume set $V0 disperse.eager-lock off
 TEST $CLI volume start $V0
 TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
-- cgit
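As a closing note, the heal wait used in the test (get_pending_heal_count
with $HEAL_TIMEOUT) can be approximated outside the test harness. This is a
hedged sketch, not harness code: the volume name and timeout are
assumptions, and it parses only the standard "gluster volume heal <vol>
info" output.

#!/bin/bash
# Wait until self-heal reports no pending entries, then exit 0.
# Assumed (illustrative): volume "ec6" and a 120-second budget.
VOL=ec6
TIMEOUT=120

for ((i = 0; i < TIMEOUT; i++)); do
    # "gluster volume heal <vol> info" prints one "Number of entries: N"
    # line per brick; sum them to get the total pending heal count.
    pending=$(gluster volume heal $VOL info 2>/dev/null |
              awk '/Number of entries:/ {sum += $NF} END {print sum + 0}')
    [ "$pending" -eq 0 ] && exit 0
    sleep 1
done
echo "heal did not complete within ${TIMEOUT}s" >&2
exit 1

Polling with a bounded timeout mirrors what EXPECT_WITHIN does in the test
above: it tolerates the time heal needs after the downed bricks return
while still failing loudly if heal never converges.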