author     Krutika Dhananjay <kdhananj@redhat.com>          2016-12-26 21:08:03 +0530
committer  Pranith Kumar Karampuri <pkarampu@redhat.com>    2016-12-28 01:06:48 -0800
commit     544f6ce9e7a249360166e98dd7df1b09f91717a9 (patch)
tree       f39a88e0654fead6f91bbfa6873242cf4bf9795e
parent     97e1e560f3124b31ad77cfb7534f4602a27e097f (diff)
cluster/afr: Fix missing name indices due to EEXIST error
Backport of: http://review.gluster.org/16286

PROBLEM:
Consider a volume with granular-entry-heal and sharding enabled. When a
replica is down and a shard is created as part of a write, the name index
is correctly created under indices/entry-changes/<dot-shard-gfid>. Now when
a read on the same region triggers another MKNOD, the fop fails on the
online bricks with EEXIST. By virtue of this being a symmetric error, the
failed_subvols[] array is reset to all zeroes. Because of this, before
post-op, the GF_XATTROP_ENTRY_OUT_KEY will be set, causing the name index,
which was created in the previous MKNOD operation, to be wrongly deleted in
THIS MKNOD operation.

FIX:
The ideal fix would have been for a transaction to delete the name index
ONLY if it knows it is the one that created the index in the first place.
This would involve gathering information from the individual bricks as to
whether THIS xattrop created the index, aggregating their responses and,
based on the various possible combinations of responses, deciding whether
to delete the index or not. This is rather complex.

A simpler fix is for post-op to examine local->op_ret when there are no
failed_subvols to figure out whether to delete the name index or not. This
can occasionally lead to the creation of stale name indices, but they won't
affect the IO path or mess with pending changelogs in any way, and
self-heal, in its crawl of the "entry-changes" directory, will take care to
delete such indices.

Change-Id: I8c5c08b7a208e840b5970fe5699dabdaf751a150
BUG: 1408785
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/16294
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
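To make the FIX paragraph concrete, below is a minimal, self-contained sketch of the post-op decision it describes. The struct, the function may_delete_name_index() and the replica count are invented for illustration and are not the real AFR types or APIs; only the roles of op_ret, failed_subvols[] and the symmetric-EEXIST scenario come from the commit message. The actual change lands in afr_changelog_populate_xdata(), shown in the diff further down.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SUBVOL_COUNT 3   /* hypothetical replica-3 setup, for illustration only */

/* Toy stand-in for the relevant bits of AFR's per-transaction state;
 * the real structure (afr_local_t) is far larger. */
struct toy_txn {
        int  op_ret;                        /* aggregated fop return value */
        int  op_errno;
        bool failed_subvols[SUBVOL_COUNT];  /* per-brick failure flags     */
};

/* Returns true when post-op may set GF_XATTROP_ENTRY_OUT_KEY, i.e. when
 * it is safe to delete the name index for this entry. */
static bool
may_delete_name_index (const struct toy_txn *txn)
{
        for (int i = 0; i < SUBVOL_COUNT; i++)
                if (txn->failed_subvols[i])
                        return false;   /* asymmetric failure: keep index */

        /* The fix: a symmetric error (e.g. EEXIST from every online brick
         * on a replayed MKNOD) leaves failed_subvols[] all-false, but
         * op_ret is still non-zero. Keep the index created earlier. */
        if (txn->op_ret)
                return false;

        return true;                    /* clean success: index may go    */
}

int
main (void)
{
        /* Second MKNOD on an already-created shard: EEXIST everywhere. */
        struct toy_txn replayed_mknod = { .op_ret = -1, .op_errno = EEXIST };

        printf ("delete name index after symmetric EEXIST? %s\n",
                may_delete_name_index (&replayed_mknod) ? "yes" : "no");
        return 0;
}

With this rule, a second MKNOD that fails identically on all online bricks no longer clears the name index left behind by the first, partially-failed MKNOD; any stale index this conservatism leaves around is later removed by self-heal's crawl of entry-changes.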
-rw-r--r--  tests/bugs/replicate/bug-1408712.t          87
-rw-r--r--  tests/include.rc                             1
-rw-r--r--  xlators/cluster/afr/src/afr-transaction.c   10
3 files changed, 98 insertions, 0 deletions
diff --git a/tests/bugs/replicate/bug-1408712.t b/tests/bugs/replicate/bug-1408712.t
new file mode 100644
index 00000000000..b26e8a06923
--- /dev/null
+++ b/tests/bugs/replicate/bug-1408712.t
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup
+
+TESTS_EXPECTED_IN_LOOP=12
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume heal $V0 granular-entry-heal enable
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 performance.flush-behind off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M1
+
+cd $M0
+TEST dd if=/dev/zero of=file bs=1M count=8
+
+# Kill brick-0.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+TEST "dd if=/dev/zero bs=1M count=8 >> file"
+
+FILE_GFID=$(get_gfid_string $M0/file)
+
+# Test that the index associated with '/.shard' is created on B1 and B2.
+TEST stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+# Check for successful creation of granular entry indices
+for i in {2..3}
+do
+ TEST_IN_LOOP stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+ TEST_IN_LOOP stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+cd ~
+TEST md5sum $M1/file
+
+# Test that the index associated with '/.shard' and the created shards do not disappear on B1 and B2.
+TEST stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+for i in {2..3}
+do
+ TEST_IN_LOOP stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+ TEST_IN_LOOP stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+# Start the brick that was down
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+# Enable shd
+TEST gluster volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+
+TEST $CLI volume heal $V0
+
+# Wait for heal to complete
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+# Now verify that there are no name indices left after self-heal
+TEST ! stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+TEST ! stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID
+
+for i in {2..3}
+do
+ TEST_IN_LOOP ! stat $B0/${V0}1/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+ TEST_IN_LOOP ! stat $B0/${V0}2/.glusterfs/indices/entry-changes/$DOT_SHARD_GFID/$FILE_GFID.$i
+done
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index cdd1a1fa132..b9e27063196 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -11,6 +11,7 @@ B0=${B0:=/d/backends}; # top level of brick directories
 WORKDIRS="$B0 $M0 $M1 $M2 $N0 $N1"
 ROOT_GFID="00000000-0000-0000-0000-000000000001"
+DOT_SHARD_GFID="be318638-e8a0-4c6d-977d-7a937aa84806"
 META_VOL=${META_VOL:=gluster_shared_storage}; # shared gluster storage volume used by snapshot scheduler, nfs ganesha and geo-rep.
 META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared gluster volume.
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 4651c27ce5c..a728b08070a 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1207,6 +1207,16 @@ afr_changelog_populate_xdata (call_frame_t *frame, afr_xattrop_type_t op,
                                 need_entry_key_set = _gf_false;
                                 break;
                         }
+                        /* If the transaction itself did not fail and there
+                         * are no failed subvolumes, check whether the fop
+                         * failed due to a symmetric error. If it did, do
+                         * not set the ENTRY_OUT xattr which would end up
+                         * deleting a name index which was created possibly by
+                         * an earlier entry txn that may have failed on some
+                         * of the sub-volumes.
+                         */
+                        if (local->op_ret)
+                                need_entry_key_set = _gf_false;
                 } else {
                         key = GF_XATTROP_ENTRY_IN_KEY;
                 }