author    Ravishankar N <ravishankar@redhat.com>  2016-05-02 18:45:44 +0530
committer Jeff Darcy <jdarcy@redhat.com>          2016-05-25 11:55:08 -0700
commit    2f29065ae4715c9c4a9d20c4d15311bebd3ddb0e (patch)
tree      163497880075bfcc2b97f92be2d2fb86ed95ab51
parent    f8f16595d8dd8c8a869630bb77b7fd1b42b97e08 (diff)
afr: Automagic unsplit-brain by [ctime|mtime|size|majority]
Introduce cluster.favorite-child-policy which, when set to one of
[ctime|mtime|size|majority], automatically heals files that are in
split-brain. The majority policy will not pick a source if there is no
majority. The other three policies pick the first brick with a valid
reply and non-zero ctime/mtime/size as the source.

Change-Id: I3c099a0404082213860f74f2c9b4d207cfaedb76
BUG: 1328224
Original-author: Richard Wareing <rwareing@fb.com>
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: http://review.gluster.org/14026
Smoke: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anuradha Talur <atalur@redhat.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
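A minimal usage sketch (hedged: <VOLNAME> is a placeholder, and this assumes
a replica volume with files already in split-brain):

    # Let the self-heal daemon pick the copy with the latest ctime as
    # the source and resolve the split-brain without user intervention:
    gluster volume set <VOLNAME> cluster.favorite-child-policy ctime
    gluster volume heal <VOLNAME>

    # Revert to manual split-brain resolution:
    gluster volume set <VOLNAME> cluster.favorite-child-policy none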
-rw-r--r--  tests/basic/afr/split-brain-favorite-child-policy.t  175
-rw-r--r--  xlators/cluster/afr/src/afr-messages.h                10
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-common.c       292
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-data.c           8
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-metadata.c       8
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal.h                1
-rw-r--r--  xlators/cluster/afr/src/afr.c                         46
-rw-r--r--  xlators/cluster/afr/src/afr.h                         13
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c        6
9 files changed, 537 insertions(+), 22 deletions(-)
diff --git a/tests/basic/afr/split-brain-favorite-child-policy.t b/tests/basic/afr/split-brain-favorite-child-policy.t
new file mode 100644
index 00000000000..66fcd67a031
--- /dev/null
+++ b/tests/basic/afr/split-brain-favorite-child-policy.t
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+#Test automatic split-brain resolution using cluster.favorite-child-policy.
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+#Create replica 2 volume
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+TEST touch $M0/file
+
+############ Healing using favorite-child-policy = ctime #################
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the first brick has the latest ctime.
+LATEST_CTIME_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy ctime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = mtime #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the latest mtime.
+LATEST_MTIME_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy mtime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = size #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the larger file.
+BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy size
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = majority on replica-3 #################
+
+#Convert volume to replica-3
+TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second and third bricks agree with each other. Pick any one of them.
+MAJORITY_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$MAJORITY_MD5" == "$HEALED_MD5" ]
+
+TEST force_umount $M0
+cleanup
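The test above drives the same flow an administrator would follow by hand.
A hedged sketch of that manual flow (<VOLNAME> and the mount path are
placeholders):

    # List files currently in split-brain:
    gluster volume heal <VOLNAME> info split-brain

    # Enable a policy, re-trigger heal, and confirm the file is
    # readable again through the mount:
    gluster volume set <VOLNAME> cluster.favorite-child-policy mtime
    gluster volume heal <VOLNAME>
    cat /mnt/<VOLNAME>/file > /dev/null && echo readable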
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index fac37b8c34b..c7af18d0f25 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -40,7 +40,7 @@
*/
#define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES 41
+#define GLFS_NUM_MESSAGES 42
#define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
#define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
@@ -361,5 +361,13 @@
*/
#define AFR_MSG_TIMER_CREATE_FAIL (GLFS_COMP_BASE_AFR + 41)
+/*!
+ * @messageid 108042
+ * @diagnosis Log messages relating to automated resolution of split-brain files
+ * based on favorite child policies.
+ * @recommendedaction
+*/
+#define AFR_MSG_SBRAIN_FAV_CHILD_POLICY (GLFS_COMP_BASE_AFR + 42)
+
#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 0b92f616030..a4c0e89e434 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -466,28 +466,20 @@ afr_dict_contains_heal_op (call_frame_t *frame)
return _gf_true;
}
-/* Return a source depending on the type of heal_op, and set sources[source],
- * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
- * only if the following condition is met:
- * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
- * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
- * sinks[node] are 1. This should be the case if the file is in split-brain.
- */
int
-afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
- unsigned char *sources,
+afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
+ xlator_t *this, unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies,
- afr_transaction_type type)
+ afr_transaction_type type, int heal_op)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
dict_t *xdata_req = NULL;
dict_t *xdata_rsp = NULL;
int ret = 0;
- int heal_op = -1;
int i = 0;
char *name = NULL;
int source = -1;
@@ -496,10 +488,6 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
priv = this->private;
xdata_req = local->xdata_req;
- ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
- if (ret)
- goto out;
-
for (i = 0; i < priv->child_count; i++) {
if (locked_on[i])
if (sources[i] || !sinks[i] || !healed_sinks[i]) {
@@ -598,6 +586,280 @@ out:
}
+int
+afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
+ inode_t *inode)
+{
+ afr_private_t *priv;
+ int vote_count = -1;
+ int fav_child = -1;
+ int i = 0;
+ int k = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug (this->name, 0, "Child:%s "
+ "mtime_sec = %d, size = %lu for gfid %s",
+ priv->children[i]->name,
+ replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_size,
+ uuid_utoa (inode->gfid));
+ vote_count = 0;
+ for (k = 0; k < priv->child_count; k++) {
+ if ((replies[k].poststat.ia_mtime ==
+ replies[i].poststat.ia_mtime) &&
+ (replies[k].poststat.ia_size ==
+ replies[i].poststat.ia_size)
+ ) {
+ vote_count++;
+ }
+ }
+ if (vote_count > priv->child_count/2) {
+ fav_child = i;
+ break;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_mtime = 0;
+ uint32_t cmp_mtime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug (this->name, 0, "Child:%s "
+ "mtime = %d, mtime_nsec = %d for gfid %s",
+ priv->children[i]->name,
+ replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_mtime_nsec,
+ uuid_utoa (inode->gfid));
+ if (replies[i].poststat.ia_mtime > cmp_mtime) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec =
+ replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_mtime == cmp_mtime)
+ && (replies[i].poststat.ia_mtime_nsec >
+ cmp_mtime_nsec)) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec =
+ replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
+int
+afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_ctime = 0;
+ uint32_t cmp_ctime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug (this->name, 0, "Child:%s "
+ "ctime = %d, ctime_nsec = %d for gfid %s",
+ priv->children[i]->name,
+ replies[i].poststat.ia_ctime,
+ replies[i].poststat.ia_ctime_nsec,
+ uuid_utoa (inode->gfid));
+ if (replies[i].poststat.ia_ctime > cmp_ctime) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec =
+ replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_ctime == cmp_ctime)
+ && (replies[i].poststat.ia_ctime_nsec >
+ cmp_ctime_nsec)) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec =
+ replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size.
+ */
+int
+afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint64_t cmp_sz = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug (this->name, 0, "Child:%s "
+ "file size = %lu for gfid %s",
+ priv->children[i]->name,
+ replies[i].poststat.ia_size,
+ uuid_utoa (inode->gfid));
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
+
+
+int
+afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
+ xlator_t *this,
+ inode_t *inode,
+ unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+ char mtime_str[256];
+ char ctime_str[256];
+ char *policy_str = NULL;
+ struct tm *tm_ptr;
+ time_t time;
+
+ priv = this->private;
+ if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MAJORITY) {
+ fav_child = afr_sh_fav_by_majority (this, replies, inode);
+ if (fav_child >= 0)
+ policy_str = "MAJORITY";
+ } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MTIME) {
+ fav_child = afr_sh_fav_by_mtime (this, replies, inode);
+ if (fav_child >= 0)
+ policy_str = "MTIME";
+ } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_CTIME) {
+ fav_child = afr_sh_fav_by_ctime (this, replies, inode);
+ if (fav_child >= 0)
+ policy_str = "CTIME";
+ } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_SIZE) {
+ fav_child = afr_sh_fav_by_size (this, replies, inode);
+ if (fav_child >= 0)
+ policy_str = "SIZE";
+ }
+
+ if (fav_child > priv->child_count - 1) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) "
+ "selected by policy %s.", fav_child, policy_str);
+ } else if (fav_child >= 0) {
+ time = replies[fav_child].poststat.ia_mtime;
+ tm_ptr = localtime (&time);
+ strftime (mtime_str, sizeof (mtime_str), "%Y-%m-%d %H:%M:%S",
+ tm_ptr);
+ time = replies[fav_child].poststat.ia_ctime;
+ tm_ptr = localtime (&time);
+ strftime (ctime_str, sizeof (ctime_str), "%Y-%m-%d %H:%M:%S",
+ tm_ptr);
+
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Source %s "
+ "selected as authentic to resolve conflicting "
+ "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, "
+ "%s ctime).",
+ priv->children[fav_child]->name,
+ uuid_utoa (inode->gfid),
+ policy_str,
+ replies[fav_child].poststat.ia_size,
+ mtime_str,
+ ctime_str);
+
+ sources[fav_child] = 1;
+ sinks[fav_child] = 0;
+ healed_sinks[fav_child] = 0;
+ }
+ return fav_child;
+}
+
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
+ */
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ int heal_op = -1;
+ int ret = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
+ if (ret)
+ goto autoheal;
+
+ ret = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
+ sources, sinks,
+ healed_sinks,
+ locked_on, replies,
+ type, heal_op);
+ return ret;
+
+autoheal:
+ /* Automatically heal if fav_child_policy is set. */
+ if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+ ret = afr_mark_split_brain_source_sinks_by_policy (frame, this,
+ inode,
+ sources,
+ sinks,
+ healed_sinks,
+ locked_on,
+ replies,
+ type);
+ }
+
+ return ret;
+}
+
gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness)
{
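The afr_sh_fav_by_* helpers above pick the source purely from each brick's
stat reply. A hedged way to predict their choice by inspecting the brick
back-ends directly (the /bricks/brick{0,1,2} paths are assumptions):

    # Print name, size (%s), mtime (%Y) and ctime (%Z) for each copy.
    # The largest value wins under the size/mtime/ctime policies; under
    # "majority" a copy must share mtime and size with more than half
    # of the bricks:
    stat -c '%n %s %Y %Z' /bricks/brick{0,1,2}/file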
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index f4cd16c3a70..2a33e53764c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -568,6 +568,7 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
static int
__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
@@ -585,7 +586,7 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
/* split brain */
- source = afr_mark_split_brain_source_sinks (frame, this,
+ source = afr_mark_split_brain_source_sinks (frame, this, inode,
sources, sinks,
healed_sinks,
locked_on, replies,
@@ -663,8 +664,9 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
*/
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
- source = __afr_selfheal_data_finalize_source (frame, this, sources,
- sinks, healed_sinks,
+ source = __afr_selfheal_data_finalize_source (frame, this, inode,
+ sources, sinks,
+ healed_sinks,
locked_on, replies,
witness);
if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 25d8b98adda..130a3daa203 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -196,6 +196,7 @@ out:
static int
__afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
@@ -215,7 +216,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
- source = afr_mark_split_brain_source_sinks (frame, this,
+ source = afr_mark_split_brain_source_sinks (frame, this, inode,
sources, sinks,
healed_sinks,
locked_on, replies,
@@ -352,8 +353,9 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
}
}
- source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
- sinks, healed_sinks,
+ source = __afr_selfheal_metadata_finalize_source (frame, this, inode,
+ sources, sinks,
+ healed_sinks,
locked_on, replies);
if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index be787683c19..ec5337e60b2 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -220,6 +220,7 @@ afr_dict_contains_heal_op (call_frame_t *frame);
int
afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d01a806fe86..f2cb7dd7fb2 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -20,6 +20,15 @@
struct volume_options options[];
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+ [AFR_FAV_CHILD_NONE] = "none",
+ [AFR_FAV_CHILD_BY_SIZE] = "size",
+ [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+ [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+ [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+ [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
+
int32_t
notify (xlator_t *this, int32_t event,
void *data, ...)
@@ -101,6 +110,19 @@ fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype,
}
int
+afr_set_favorite_child_policy (afr_private_t *priv, char *policy)
+{
+ int index = -1;
+
+ index = gf_get_index_by_elem (afr_favorite_child_policies, policy);
+ if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+ return -1;
+
+ priv->fav_child_policy = index;
+
+ return 0;
+}
+int
reconfigure (xlator_t *this, dict_t *options)
{
afr_private_t *priv = NULL;
@@ -109,6 +131,7 @@ reconfigure (xlator_t *this, dict_t *options)
int ret = -1;
int index = -1;
char *qtype = NULL;
+ char *fav_child_policy = NULL;
priv = this->private;
@@ -228,6 +251,11 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("shd-wait-qlength", priv->shd.wait_qlength,
options, uint32, out);
+ GF_OPTION_RECONF ("favorite-child-policy", fav_child_policy, options,
+ str, out);
+ if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
+ goto out;
+
priv->did_discovery = _gf_false;
ret = 0;
@@ -261,6 +289,7 @@ init (xlator_t *this)
char *qtype = NULL;
char *xattrs_list = NULL;
char *ptr = NULL;
+ char *fav_child_policy = NULL;
if (!this->children) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -340,6 +369,10 @@ init (xlator_t *this)
fav_child->name, fav_child->name);
}
+ GF_OPTION_INIT ("favorite-child-policy", fav_child_policy, str, out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
+
GF_OPTION_INIT ("shd-max-threads", priv->shd.max_threads,
uint32, out);
@@ -907,5 +940,18 @@ struct volume_options options[] = {
"granular way of recording changelogs and doing entry "
"self-heal.",
},
+ { .key = {"favorite-child-policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "size", "ctime", "mtime", "majority"},
+ .default_value = "none",
+ .description = "This option can be used to automatically resolve "
+ "split-brains using various policies without user "
+ "intervention. \"size\" picks the file with the "
+ "biggest size as the source. \"ctime\" and \"mtime\" "
+ "pick the file with the latest ctime and mtime "
+ "respectively as the source. \"majority\" picks a file"
+ " with identical mtime and size in more than half the "
+ "number of bricks in the replica.",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 7d270ea94e7..5482dab25b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -50,6 +50,16 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
#define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;})
#define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
+
+typedef enum {
+ AFR_FAV_CHILD_NONE,
+ AFR_FAV_CHILD_BY_SIZE,
+ AFR_FAV_CHILD_BY_CTIME,
+ AFR_FAV_CHILD_BY_MTIME,
+ AFR_FAV_CHILD_BY_MAJORITY,
+ AFR_FAV_CHILD_POLICY_MAX,
+} afr_favorite_child_policy;
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -94,6 +104,9 @@ typedef struct _afr_private {
int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
+ afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic
+ resolution of split-brains.*/
+
gf_boolean_t inodelk_trace;
gf_boolean_t entrylk_trace;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 0600290cbe0..9f089e8dd3b 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1293,6 +1293,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_7_10,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.favorite-child-policy",
+ .voltype = "cluster/replicate",
+ .type = DOC,
+ .op_version = GD_OP_VERSION_3_7_12,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
/* stripe xlator options */
{ .key = "cluster.stripe-block-size",