From 6f08b9f2b006a4eafaa176cfd792038eed7f6c98 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Wed, 25 May 2016 21:18:19 +0530 Subject: afr: Automagic unsplit-brain by [ctime|mtime|size|majority] Backport of http://review.gluster.org/#/c/14026/ Introduce cluster.favorite-child-policy which when enabled with [ctime|mtime|size|majority], automatically heals files that are in split-brian. The majority policy will not pick a source if there is no majority. The other three policies pick the first brick with a valid reply and non-zero ctime/mtime/size as source. Change-Id: I93623a914dce2839957fce87b514050e9d274d4c BUG: 1339639 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/14535 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Pranith Kumar Karampuri --- .../basic/afr/split-brain-favorite-child-policy.t | 175 +++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 tests/basic/afr/split-brain-favorite-child-policy.t (limited to 'tests') diff --git a/tests/basic/afr/split-brain-favorite-child-policy.t b/tests/basic/afr/split-brain-favorite-child-policy.t new file mode 100644 index 00000000000..66fcd67a031 --- /dev/null +++ b/tests/basic/afr/split-brain-favorite-child-policy.t @@ -0,0 +1,175 @@ +#!/bin/bash + +#Test the split-brain resolution CLI commands. +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd + +#Create replica 2 volume +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +TEST touch $M0/file + +############ Healing using favorite-child-policy = ctime ################# +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +TEST $CLI volume heal $V0 + +#file fill in split-brain +cat $M0/file > /dev/null +EXPECT "1" echo $? + +#We know that the first brick has latest ctime. +LATEST_CTIME_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) +TEST $CLI volume set $V0 cluster.favorite-child-policy ctime +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +cat $M0/file > /dev/null +EXPECT "0" echo $? +HEALED_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) +TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ] + +############ Healing using favorite-child-policy = mtime ################# +TEST $CLI volume set $V0 cluster.favorite-child-policy none +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +TEST $CLI volume heal $V0 + +#file still in split-brain +cat $M0/file > /dev/null +EXPECT "1" echo $? + +#We know that the second brick has latest mtime. +LATEST_CTIME_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) +TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +cat $M0/file > /dev/null +EXPECT "0" echo $? +HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) +TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ] + +############ Healing using favorite-child-policy = size ################# +TEST $CLI volume set $V0 cluster.favorite-child-policy none +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240 + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +TEST $CLI volume heal $V0 + +#file fill in split-brain +cat $M0/file > /dev/null +EXPECT "1" echo $? + +#We know that the second brick has the bigger size file. +BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) +TEST $CLI volume set $V0 cluster.favorite-child-policy size +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +cat $M0/file > /dev/null +EXPECT "0" echo $? +HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) +TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5" ] + +############ Healing using favorite-child-policy = majority on replica-3 ################# + +#Convert volume to replica-3 +TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.favorite-child-policy none +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024 +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240 + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 +TEST $CLI volume heal $V0 + +#file fill in split-brain +cat $M0/file > /dev/null +EXPECT "1" echo $? + +#We know that the second and third bricks agree with each other. Pick any one of them. +MAJORITY_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume heal $V0 +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 +cat $M0/file > /dev/null +EXPECT "0" echo $? +HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) +TEST [ "$MAJORITY_MD5" == "$HEALED_MD5" ] + +TEST force_umount $M0 +cleanup -- cgit