From d9f9be442808ed13135f560698a2f95fe66282a5 Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Wed, 27 Sep 2017 13:18:25 -0700 Subject: self-heal: fix automatic split-brain resolution options Differential Revision: https://phabricator.intern.facebook.com/D5927193 Change-Id: Ife04c8738b9ee721e7be9bc843b2f6d54bbb468e --- tests/basic/afr/gfid-unsplit-type-mismatch.t | 5 +- tests/basic/afr/gfid-unsplit.t | 5 +- tests/basic/afr/shd-autofix-nogfid.t | 5 +- tests/basic/afr/shd-force-inspect.t | 5 +- tests/basic/afr/shd-pgfid-heal.t | 5 +- tests/bugs/fb2506544_ctime.t | 8 +- tests/bugs/fb2506544_majority.t | 9 +- tests/bugs/fb2506544_mtime.t | 8 +- tests/bugs/fb2506544_size.t | 8 +- xlators/cluster/afr/src/afr-self-heal-common.c | 65 +++++++------ xlators/cluster/afr/src/afr.c | 122 ++++++++++++++++++++++-- xlators/cluster/afr/src/afr.h | 21 ++-- xlators/mgmt/glusterd/src/glusterd-volume-set.c | 24 +++++ 13 files changed, 206 insertions(+), 84 deletions(-) diff --git a/tests/basic/afr/gfid-unsplit-type-mismatch.t b/tests/basic/afr/gfid-unsplit-type-mismatch.t index 9e205021a0d..172645d3fae 100644 --- a/tests/basic/afr/gfid-unsplit-type-mismatch.t +++ b/tests/basic/afr/gfid-unsplit-type-mismatch.t @@ -15,9 +15,8 @@ TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 nfs.disable on TEST $CLI volume set $V0 cluster.quorum-type none -TEST $CLI volume set $V0 cluster.favorite-child-policy mtime -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off diff --git a/tests/basic/afr/gfid-unsplit.t b/tests/basic/afr/gfid-unsplit.t index 0b883ab658f..d6cb7e74a8a 100644 --- a/tests/basic/afr/gfid-unsplit.t +++ b/tests/basic/afr/gfid-unsplit.t @@ -17,9 +17,8 @@ TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume set $V0 cluster.quorum-type none TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 nfs.disable off -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on -TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off diff --git a/tests/basic/afr/shd-autofix-nogfid.t b/tests/basic/afr/shd-autofix-nogfid.t index 8bc2c965640..f54a8eb8600 100644 --- a/tests/basic/afr/shd-autofix-nogfid.t +++ b/tests/basic/afr/shd-autofix-nogfid.t @@ -15,9 +15,8 @@ TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume set $V0 cluster.self-heal-daemon on TEST $CLI volume set $V0 nfs.disable on TEST $CLI volume set $V0 cluster.quorum-type auto -TEST $CLI volume set $V0 cluster.favorite-child-policy majority -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off diff --git a/tests/basic/afr/shd-force-inspect.t b/tests/basic/afr/shd-force-inspect.t index caceb841322..c8b027c8933 100644 --- a/tests/basic/afr/shd-force-inspect.t +++ b/tests/basic/afr/shd-force-inspect.t @@ -15,9 +15,8 @@ TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume set $V0 cluster.self-heal-daemon on TEST $CLI volume set $V0 nfs.disable on TEST $CLI volume set $V0 cluster.quorum-type none -TEST $CLI volume set $V0 cluster.favorite-child-policy majority -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off diff --git a/tests/basic/afr/shd-pgfid-heal.t b/tests/basic/afr/shd-pgfid-heal.t index 6213e4c6374..d723fe436a6 100644 --- a/tests/basic/afr/shd-pgfid-heal.t +++ b/tests/basic/afr/shd-pgfid-heal.t @@ -15,10 +15,9 @@ TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume set $V0 cluster.self-heal-daemon on TEST $CLI volume set $V0 nfs.disable on TEST $CLI volume set $V0 cluster.quorum-type none -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.pgfid-self-heal on -TEST $CLI volume set $V0 cluster.favorite-child-policy majority TEST $CLI volume set $V0 storage.build-pgfid on TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.data-self-heal off diff --git a/tests/bugs/fb2506544_ctime.t b/tests/bugs/fb2506544_ctime.t index 8c7ab02cc8e..f59525db590 100755 --- a/tests/bugs/fb2506544_ctime.t +++ b/tests/bugs/fb2506544_ctime.t @@ -16,7 +16,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on TEST $CLI volume set $V0 cluster.entry-self-heal on -#EST $CLI volume set $V0 cluster.favorite-child-by-ctime off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -61,8 +60,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # The MD5 should be of the file which was created first. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-ctime on -TEST $CLI volume set $V0 cluster.favorite-child-policy ctime +TEST $CLI volume set $V0 cluster.favorite-child-by-ctime on sleep 1 # Mount the volume TEST glusterfs --log-level DEBUG --volfile-id=/$V0 --volfile-server=$H0 $M0 \ @@ -85,7 +83,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on TEST $CLI volume set $V0 cluster.entry-self-heal on -#EST $CLI volume set $V0 cluster.favorite-child-by-ctime off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -131,8 +128,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # The MD5 should be of the file which was created first. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-ctime on -TEST $CLI volume set $V0 cluster.favorite-child-policy ctime +TEST $CLI volume set $V0 cluster.favorite-child-by-ctime on TEST $CLI volume set $V0 cluster.self-heal-daemon on sleep 1 /etc/init.d/glusterd restart_shd diff --git a/tests/bugs/fb2506544_majority.t b/tests/bugs/fb2506544_majority.t index c38a6d59947..a2c489a4063 100755 --- a/tests/bugs/fb2506544_majority.t +++ b/tests/bugs/fb2506544_majority.t @@ -16,8 +16,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on TEST $CLI volume set $V0 cluster.entry-self-heal on -#EST $CLI volume set $V0 cluster.favorite-child-by-majority off -TEST $CLI volume set $V0 cluster.favorite-child-policy majority # This would normally be a toxic combination because it allows us to create a # split brain by writing to 1/3 replicas ... but for testing that's exactly # what we want. @@ -67,8 +65,7 @@ sleep 1 # Compare MD5's, the healed file should be that of the file which is # on 2/3 bricks. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on sleep 1 # Mount the volume TEST glusterfs --log-level DEBUG --volfile-id=/$V0 --volfile-server=$H0 $M0 \ @@ -91,7 +88,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off -#EST $CLI volume set $V0 cluster.favorite-child-by-majority off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -139,8 +135,7 @@ sleep 1 # Compare MD5's, the healed file should be that of the file which is # on 2/3 bricks. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-majority on -TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume set $V0 cluster.favorite-child-by-majority on TEST $CLI volume set $V0 cluster.self-heal-daemon on sleep 1 /etc/init.d/glusterd restart_shd diff --git a/tests/bugs/fb2506544_mtime.t b/tests/bugs/fb2506544_mtime.t index b908fdaddd5..b68c6b2e089 100755 --- a/tests/bugs/fb2506544_mtime.t +++ b/tests/bugs/fb2506544_mtime.t @@ -16,7 +16,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on TEST $CLI volume set $V0 cluster.entry-self-heal on -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -62,8 +61,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # Compare MD5's, the MD5 should be of the file we modified last. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on -TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on sleep 1 # Mount the volume TEST glusterfs --log-level DEBUG --volfile-id=/$V0 --volfile-server=$H0 $M0 \ @@ -86,7 +84,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -134,8 +131,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # Compare MD5's, the MD5 should be of the file we modified last. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on -TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume set $V0 cluster.favorite-child-by-mtime on TEST $CLI volume set $V0 cluster.self-heal-daemon on sleep 1 /etc/init.d/glusterd restart_shd diff --git a/tests/bugs/fb2506544_size.t b/tests/bugs/fb2506544_size.t index 593c1053853..731ffd13f56 100755 --- a/tests/bugs/fb2506544_size.t +++ b/tests/bugs/fb2506544_size.t @@ -16,7 +16,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on TEST $CLI volume set $V0 cluster.entry-self-heal on -#EST $CLI volume set $V0 cluster.favorite-child-by-size off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -61,8 +60,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # Compare MD5's, the MD5 should be of the file that is the largest. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-size on -TEST $CLI volume set $V0 cluster.favorite-child-policy size +TEST $CLI volume set $V0 cluster.favorite-child-by-size on sleep 1 # Mount the volume TEST glusterfs --log-level DEBUG --volfile-id=/$V0 --volfile-server=$H0 $M0 \ @@ -88,7 +86,6 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.data-self-heal off TEST $CLI volume set $V0 cluster.metadata-self-heal off TEST $CLI volume set $V0 cluster.entry-self-heal off -#EST $CLI volume set $V0 cluster.favorite-child-by-size off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 1 TEST $CLI volume start $V0 @@ -135,8 +132,7 @@ sleep 1 # Ok now turn the favorite-child option and we should be able to read it. # Compare MD5's, the MD5 should be of the file that is the largest. umount $M0 -#EST $CLI volume set $V0 cluster.favorite-child-by-size on -TEST $CLI volume set $V0 cluster.favorite-child-policy size +TEST $CLI volume set $V0 cluster.favorite-child-by-size on TEST $CLI volume set $V0 cluster.self-heal-daemon on sleep 1 /etc/init.d/glusterd restart_shd diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 5a9ab795a94..6d123bf407f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -867,12 +867,30 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode) return fav_child; } + +typedef struct { + int (*func) (xlator_t *this, + struct afr_reply *replies, + inode_t *inode); + char *name; +} _policy_pair; + +static _policy_pair afr_sh_fav_child_policies[AFR_FAV_CHILD_POLICY_MAX] = { + [AFR_FAV_CHILD_BY_MAJORITY] = { afr_sh_fav_by_majority, + "MAJORITY" }, + [AFR_FAV_CHILD_BY_MTIME] = { afr_sh_fav_by_mtime, "MTIME" }, + [AFR_FAV_CHILD_BY_CTIME] = { afr_sh_fav_by_ctime, "CTIME" }, + [AFR_FAV_CHILD_BY_SIZE] = { afr_sh_fav_by_size, "SIZE" }, +}; + int afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies, inode_t *inode, char **policy_str) { afr_private_t *priv = NULL; - int fav_child = -1; + int fav_child; + int pol_index; + _policy_pair *policy; priv = this->private; if (!afr_can_decide_split_brain_source_sinks (replies, @@ -880,37 +898,26 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies, return -1; } - switch (priv->fav_child_policy) { - case AFR_FAV_CHILD_BY_SIZE: - fav_child = afr_sh_fav_by_size (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "SIZE"; - } - break; - case AFR_FAV_CHILD_BY_CTIME: - fav_child = afr_sh_fav_by_ctime (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "CTIME"; - } - break; - case AFR_FAV_CHILD_BY_MTIME: - fav_child = afr_sh_fav_by_mtime (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "MTIME"; - } - break; - case AFR_FAV_CHILD_BY_MAJORITY: - fav_child = afr_sh_fav_by_majority (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "MAJORITY"; + pol_index = AFR_FAV_CHILD_NONE + 1; + while (pol_index < AFR_FAV_CHILD_POLICY_MAX) { + if (priv->fav_child_policy & (1 << pol_index)) { + policy = &afr_sh_fav_child_policies[pol_index]; + gf_log (this->name, GF_LOG_TRACE, + "trying policy %s", policy->name); + fav_child = policy->func (this, replies, inode); + if (fav_child >= 0) { + gf_log (this->name, GF_LOG_TRACE, + "policy %s WORKED", policy->name); + if (policy_str) { + *policy_str = policy->name; + } + return fav_child; + } } - break; - case AFR_FAV_CHILD_NONE: - default: - break; + ++pol_index; } - return fav_child; + return -1; } int diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index f291626fff9..a917efbbcb6 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -112,14 +112,29 @@ fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype, int afr_set_favorite_child_policy (afr_private_t *priv, char *policy) { - int index = -1; - - index = gf_get_index_by_elem (afr_favorite_child_policies, policy); - if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX) - return -1; - - priv->fav_child_policy = index; + char *token; + char *saveptr; + uint32_t retval = 0; + + if (strcasecmp (policy, "none") != 0) { + token = strtok_r (policy, ",", &saveptr); + while (token) { + if (strcasecmp (token, "majority") == 0) { + retval |= (1 << AFR_FAV_CHILD_BY_MAJORITY); + } else if (strcasecmp (token, "mtime") == 0) { + retval |= (1 << AFR_FAV_CHILD_BY_MTIME); + } else if (strcasecmp (token, "ctime") == 0) { + retval |= (1 << AFR_FAV_CHILD_BY_CTIME); + } else if (strcasecmp (token, "size") == 0) { + retval |= (1 << AFR_FAV_CHILD_BY_SIZE); + } else { + return -1; + } + token = strtok_r (NULL, ",", &saveptr); + } + } + priv->fav_child_policy = retval; return 0; } int @@ -132,6 +147,7 @@ reconfigure (xlator_t *this, dict_t *options) int index = -1; char *qtype = NULL; char *fav_child_policy = NULL; + gf_boolean_t policy_flag; priv = this->private; @@ -302,6 +318,33 @@ reconfigure (xlator_t *this, dict_t *options) if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1) goto out; + GF_OPTION_RECONF ("favorite-child-by-majority", policy_flag, options, + bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_MAJORITY); + } + + GF_OPTION_RECONF ("favorite-child-by-mtime", policy_flag, options, + bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_MTIME); + } + + GF_OPTION_RECONF ("favorite-child-by-ctime", policy_flag, options, + bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_CTIME); + } + + GF_OPTION_RECONF ("favorite-child-by-size", policy_flag, options, + bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_SIZE); + } + + gf_log (this->name, GF_LOG_DEBUG, "fav_child policy = 0x%x", + priv->fav_child_policy); + priv->did_local_discovery = _gf_false; priv->did_discovery = _gf_false; @@ -335,6 +378,7 @@ init (xlator_t *this) xlator_t *fav_child = NULL; char *qtype = NULL; char *fav_child_policy = NULL; + gf_boolean_t policy_flag; if (!this->children) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -421,6 +465,29 @@ init (xlator_t *this) if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1) goto out; + GF_OPTION_INIT ("favorite-child-by-majority", policy_flag, bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_MAJORITY); + } + + GF_OPTION_INIT ("favorite-child-by-mtime", policy_flag, bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_MTIME); + } + + GF_OPTION_INIT ("favorite-child-by-ctime", policy_flag, bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_CTIME); + } + + GF_OPTION_INIT ("favorite-child-by-size", policy_flag, bool, out); + if (policy_flag) { + priv->fav_child_policy |= (1 << AFR_FAV_CHILD_BY_SIZE); + } + + gf_log (this->name, GF_LOG_DEBUG, "fav_child policy = 0x%x", + priv->fav_child_policy); + GF_OPTION_INIT ("shd-max-threads", priv->shd.max_threads, uint32, out); @@ -1102,7 +1169,6 @@ struct volume_options options[] = { }, { .key = {"favorite-child-policy"}, .type = GF_OPTION_TYPE_STR, - .value = {"none", "size", "ctime", "mtime", "majority"}, .default_value = "none", .description = "This option can be used to automatically resolve " "split-brains using various policies without user " @@ -1111,7 +1177,45 @@ struct volume_options options[] = { "pick the file with the latest ctime and mtime " "respectively as the source. \"majority\" picks a file" " with identical mtime and size in more than half the " - "number of bricks in the replica.", + "number of bricks in the replica. More than one " + "policy can be specified, separated by commas. The " + "order of attempted application (regardless of the " + "specification order) is: majority, mtime, ctime, " + "size. The value set here can be modified by the " + "favorite-child-by-xxx options." + }, + { .key = {"favorite-child-by-majority"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "Allow automatic resolution of split-brains by " + "majority (more than half of the copies with same " + "mtime and size). This can be combined with other " + "favorite-child-by-xxx options, and can modify the " + "value set by favorite-child-policy." + }, + { .key = {"favorite-child-by-mtime"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "Allow automatic resolution of split-brains by " + "latest mtime. This can be combined with other " + "favorite-child-by-xxx options, and can modify the " + "value set by favorite-child-policy." + }, + { .key = {"favorite-child-by-ctime"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "Allow automatic resolution of split-brains by " + "latest ctime. This can be combined with other " + "favorite-child-by-xxx options, and can modify the " + "value set by favorite-child-policy." + }, + { .key = {"favorite-child-by-size"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "Allow automatic resolution of split-brains by " + "greatest size. This can be combined with other " + "favorite-child-by-xxx options, and can modify the " + "value set by favorite-child-policy." }, { .key = {"pgfid-self-heal"}, .type = GF_OPTION_TYPE_BOOL, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 3314f865781..f6b8fa1b8b8 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -68,12 +68,16 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); uuid_utoa (local->inode->gfid)); \ } while (0) +/* + * These *must* be defined in order of decreasing precedence in order for + * afr_sh_get_fav_by_policy to work correctly. + */ typedef enum { - AFR_FAV_CHILD_NONE, - AFR_FAV_CHILD_BY_SIZE, - AFR_FAV_CHILD_BY_CTIME, + AFR_FAV_CHILD_NONE = 0, + AFR_FAV_CHILD_BY_MAJORITY, /* Highest precedence. */ AFR_FAV_CHILD_BY_MTIME, - AFR_FAV_CHILD_BY_MAJORITY, + AFR_FAV_CHILD_BY_CTIME, + AFR_FAV_CHILD_BY_SIZE, /* Lowest precedence. */ AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; @@ -136,8 +140,13 @@ typedef struct _afr_private { int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ - afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic - resolution of split-brains.*/ + /* + * Policy to use for automatic resolution of split-brains. This needs + * to be a bit-field so that we can iterate over multiple policies when + * earlier ones yield ties. The actual bits used are (1 << X) where X + * is one of the enum values from afr_favorite_child_policy. + */ + uint32_t fav_child_policy; gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 24f4f62ce59..cd42cf75756 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1313,6 +1313,30 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_12, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.favorite-child-by-majority", + .voltype = "cluster/replicate", + .type = DOC, + .op_version = GD_OP_VERSION_3_7_12, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.favorite-child-by-mtime", + .voltype = "cluster/replicate", + .type = DOC, + .op_version = GD_OP_VERSION_3_7_12, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.favorite-child-by-ctime", + .voltype = "cluster/replicate", + .type = DOC, + .op_version = GD_OP_VERSION_3_7_12, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.favorite-child-by-size", + .voltype = "cluster/replicate", + .type = DOC, + .op_version = GD_OP_VERSION_3_7_12, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "cluster.pgfid-self-heal", .voltype = "cluster/replicate", .op_version = 2, -- cgit