-rw-r--r--   tests/basic/halo-failover-disabled.t              |  10
-rw-r--r--   tests/basic/halo-hybrid.t                         |  70
-rw-r--r--   xlators/cluster/afr/src/afr-common.c              | 223
-rw-r--r--   xlators/cluster/afr/src/afr.c                     |  40
-rw-r--r--   xlators/cluster/afr/src/afr.h                     |  12
-rw-r--r--   xlators/mgmt/glusterd/src/glusterd-volume-set.c   |   5
6 files changed, 337 insertions, 23 deletions
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 31a1d166404..4cc66e38de6 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -35,6 +35,10 @@ TEST $CLI volume set $V0 cluster.data-self-heal on
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
 TEST $CLI volume set $V0 cluster.self-heal-daemon on
 TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+
 # Use a large ping time here so the spare brick is not marked up
 # based on the ping time. The only way it can get marked up is
 # by being swapped in via the down event (which is what we are disabling).
@@ -47,8 +51,8 @@ cd $M0
 # Write some data to the mount
 TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
 
-# Kill the first brick, fail-over to 3rd
-TEST kill_brick $V0 $H0 $B0/${V0}0
+UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
 
 # Test that quorum should fail and the mount is RO, the reason here
 # is that although there _is_ another brick running which _could_
@@ -59,7 +63,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0
 TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
 
 TEST $CLI volume start $V0 force
-sleep 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX
 
 # Test that quorum should be restored and the file is writable
 TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
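A note on the new kill target above: rather than always killing brick 0, the test now scrapes the FUSE client log for whichever child halo most recently marked UP, and kills that one; this is also why the patch bumps the client log level to DEBUG. The sketch below restates that pipeline with comments. It assumes the test harness's convention that the client log for the $M0 mount lives at /var/log/glusterfs/$M0LOG, exactly as the test itself does.

    # Sketch: how UP_IDX is derived (assumes the harness's $M0LOG naming).
    LOG=/var/log/glusterfs/$M0LOG

    # dump_halo_states() emits DEBUG lines of the form
    #   "Child 2 halo state: UP (42 ms)"
    # so the child index of the most recent UP transition is:
    UP_IDX=$(grep "halo state: UP" "$LOG" |   # every halo UP transition
             tail -n1 |                       # keep only the most recent one
             grep -Eo "Child [0-9]+" |        # isolate "Child <idx>"
             grep -Eo "[0-9]+")               # keep just the index
    echo "most recently UP child: ${UP_IDX}"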
diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t
new file mode 100644
index 00000000000..4574fdfe41e
--- /dev/null
+++ b/tests/basic/halo-hybrid.t
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test for the Halo hybrid feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+#    heal daemon is off to start.
+# 2. Write some data
+# 3. Verify hybrid code chose children for lookups
+# 4. Verify hybrid code chose child for reads
+# 5. Verify hybrid code wrote synchronously to all replicas
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function found_fuse_log_msg {
+    local dir="$1"
+    local msg="$2"
+    local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l)
+    if (( $cnt == 1 )); then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-hybrid-mode True
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level TRACE
+TEST $CLI volume start $V0
+
+# Start a synchronous mount
+TEST glusterfs --volfile-id=/$V0 \
+    --xlator-option *replicate*.halo-max-latency=9999 \
+    --volfile-server=$H0 $M0 \
+    --attribute-timeout=0 --entry-timeout=0
+sleep 2
+cd $M0
+
+TEST mkdir testdir
+TEST cd testdir
+for i in {1..5}
+do
+    dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+TEST ls -l
+
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs"
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child"
+
+B0_CNT=$(ls $B0/${V0}0/testdir | wc -l)
+B1_CNT=$(ls $B0/${V0}1/testdir | wc -l)
+B2_CNT=$(ls $B0/${V0}2/testdir | wc -l)
+
+# Writes should be synchronous, all should have same
+# file count
+TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))"
+
+cleanup
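Stripped of the self-heal and timing knobs, the configuration the new test exercises boils down to two settings: halo itself and the new hybrid flag (afr_halo_read_subvol() bails out unless both are enabled). A minimal sketch for an existing replica volume follows; the volume name myvol is illustrative, and the TRACE log level is only needed if you want to see the selection messages this patch logs.

    VOL=myvol    # hypothetical volume name; substitute your own

    # Both flags are required for hybrid behaviour.
    gluster volume set $VOL cluster.halo-enabled True
    gluster volume set $VOL cluster.halo-hybrid-mode True

    # Optional: surface the TRACE-level selection messages added by this
    # patch ("Selected hybrid child ...", "... children for LOOKUPs").
    gluster volume set $VOL diagnostics.client-log-level TRACE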
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c4b6fd6a9b6..0c621271405 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,6 +45,10 @@
 
 #define CHILD_UP_STR "UP"
 #define CHILD_DOWN_STR "DOWN"
+#define CHILD_DISCONNECTED_STR "DOWN"
+
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *fastest_children);
 
 call_frame_t *
 afr_copy_frame (call_frame_t *base)
@@ -1371,21 +1375,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
                        sizeof(gfid_copy)) % child_count;
 }
 
+/*
+ * afr_halo_read_subvol
+ *
+ * Given an array representing the readable children, this function returns
+ * which one of the readable children meets the halo hybrid criteria.  In the
+ * event none are found, -1 is returned and another strategy will have to be
+ * used to figure out where the read should come from.
+ */
+int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) {
+        afr_private_t *priv = NULL;
+        unsigned char *hybrid_children;
+        int32_t        hybrid_cnt = 0;
+        int            read_subvol = -1;
+        int            i = 0;
+
+        priv = this->private;
+
+        /* Halo inactive or hybrid mode disabled, bail.... */
+        if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+                return -1;
+
+        /* AFR discovery edge case: if we are already pinned to a child
+         * which meets the latency threshold, go with that child for
+         * consistency purposes.
+         */
+        if (priv->read_child >= 0 && readable[priv->read_child] &&
+            priv->child_latency[priv->read_child] <=
+            AFR_HALO_HYBRID_LATENCY_MSEC) {
+                return priv->read_child;
+        }
+
+        hybrid_children = alloca0 (priv->child_count);
+        hybrid_cnt = find_hybrid_children (this, hybrid_children);
+        if (hybrid_cnt) {
+                for (i = 0; i < priv->child_count; i++) {
+                        if (readable[i] && hybrid_children[i]) {
+                                read_subvol = i;
+                                priv->read_child = read_subvol;
+                                gf_log (this->name, GF_LOG_TRACE,
+                                        "Selected hybrid child %d for reads",
+                                        i);
+                                break;
+                        }
+                }
+        }
+
+        return read_subvol;
+}
+
 int
 afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
                                   unsigned char *readable,
                                   afr_read_subvol_args_t *args)
 {
-        int            i = 0;
-        int            read_subvol = -1;
-        afr_private_t *priv = NULL;
+        int                     i           = 0;
+        int                     read_subvol = -1;
+        afr_private_t          *priv        = NULL;
         afr_read_subvol_args_t local_args = {0,};
 
-        priv = this->private;
+        priv = this->private;
+
+        /* Choose the lowest-latency child for reads */
+        read_subvol = afr_halo_read_subvol (this, readable);
+        if (read_subvol != -1)
+                return read_subvol;
 
-        /* first preference - explicitly specified or local subvolume */
-        if (priv->read_child >= 0 && readable[priv->read_child])
+        /* first preference - explicitly specified or local subvolume */
+        if (priv->read_child >= 0 && readable[priv->read_child])
                 return priv->read_child;
 
         if (inode_is_linked (inode)) {
@@ -1411,7 +1469,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
         return -1;
 }
 
-
 int
 afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
                                 unsigned char *readable, int *event_p,
@@ -2071,6 +2128,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                         priv->children[child_index]->name);
 
                 priv->read_child = child_index;
+        } else if (priv->halo_enabled) {
+                if (priv->read_child < 0) {
+                        priv->read_child = child_index;
+                } else if (priv->child_latency[child_index] <
+                           priv->child_latency[priv->read_child]) {
+                        priv->read_child = child_index;
+                }
         }
 out:
         STACK_DESTROY(frame->root);
@@ -2487,6 +2551,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
         afr_local_t   *local = NULL;
         afr_private_t *priv = NULL;
         int            call_count = 0;
+        unsigned char *hybrid_children = NULL;
 
         local = frame->local;
         priv = this->private;
@@ -2497,8 +2562,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
                 goto out;
         }
 
-        call_count = local->call_count = AFR_COUNT (local->child_up,
-                                                    priv->child_count);
+        hybrid_children = alloca0 (priv->child_count);
+        call_count = find_hybrid_children (this, hybrid_children);
+        if (call_count) {
+                for (i = 0; i < priv->child_count; i++)
+                        local->child_up[i] = hybrid_children[i];
+                gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid "
+                        "children for LOOKUPs", call_count);
+        } else {
+                hybrid_children = NULL;
+                call_count = AFR_COUNT (local->child_up, priv->child_count);
+        }
+
+        local->call_count = call_count;
 
         ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
                                             &local->loc);
@@ -2731,6 +2807,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
         afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
                              AFR_DATA_TRANSACTION, NULL);
 
+        /* So this is the "secret" to why "Hybrid" halo works.  Encoded in
+         * the cached inodes, we store what is effectively the "generational"
+         * state of the cluster along with a "packed" version of the extended
+         * attributes which determine which nodes are wise/fools.  We can
+         * consult these cached values to figure out whom we can trust.  In
+         * the event the state of our cluster changes and we can no longer
+         * trust the cached info, we "refresh" the inode (and hit all regions)
+         * to ensure we know which bricks we can safely read from.
+         */
         if (event != local->event_generation)
                 afr_inode_refresh (frame, this, loc->parent, NULL,
                                    afr_lookup_do);
@@ -4203,6 +4288,97 @@ __get_heard_from_all_status (xlator_t *this)
         return heard_from_all;
 }
 
+/*
+ * _afr_cmp_child
+ *
+ * Passed to qsort to order a list of children by their latency.
+ */
+static int
+_afr_cmp_child (const void *child1, const void *child2)
+{
+        struct afr_child *child11 = (struct afr_child *)child1;
+        struct afr_child *child22 = (struct afr_child *)child2;
+
+        if (child11->latency > child22->latency) {
+                return 1;
+        }
+        if (child11->latency == child22->latency) {
+                return 0;
+        }
+        return -1;
+}
+
+/*
+ * find_hybrid_children
+ *
+ * Given a char array representing our children (aka the bricks within our
+ * AFR "subvolume"), mark the children which are within
+ * halo_hybrid_read_max_latency_msec; if none fit this condition, pick the
+ * fastest two bricks.
+ *
+ * You might ask, why not just pick the quickest brick and be done with it?
+ * Well, being within our set is not sufficient to be chosen for the read;
+ * a child must also be marked "readable", so we still want to choose as
+ * many children as we can within our local region to ensure at least one
+ * of them is readable.
+ *
+ * To illustrate this, consider the case where only 1 of 2 in-region bricks
+ * received a sync from some other writer, and the 2nd brick, although
+ * faster, wasn't present for that write.  In this case we'll want to use
+ * the slower brick to service the read.
+ *
+ * In short, this function just tells the caller which children are hybrid
+ * candidates; it gives no signal as to their readability, nor should it,
+ * since readability is handled later in the various flows (e.g. by
+ * afr_halo_read_subvol).
+ */
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *hybrid_children)
+{
+        int32_t           i = 0;
+        afr_private_t    *priv = NULL;
+        struct afr_child *sorted_list = NULL;
+        uint32_t          max_latency;
+        uint32_t          limit = AFR_HALO_HYBRID_CHILD_LIMIT;
+
+        priv = this->private;
+
+        if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+                return 0;
+
+        if (limit > priv->child_count)
+                limit = priv->child_count;
+
+        max_latency = priv->halo_hybrid_read_max_latency_msec;
+
+        sorted_list = alloca (sizeof (struct afr_child) * priv->child_count);
+
+        /* Build the per-child list of current latencies */
+        for (i = 0; i < priv->child_count; i++) {
+                sorted_list[i].idx = i;
+                sorted_list[i].child_up = priv->child_up[i];
+                sorted_list[i].latency = priv->child_latency[i];
+        }
+
+        /* Sort the children by latency, fastest first */
+        qsort (sorted_list, priv->child_count, sizeof (struct afr_child),
+               _afr_cmp_child);
+
+        i = 0;
+        while (i < priv->child_count && sorted_list[i].latency <= max_latency)
+                hybrid_children[sorted_list[i++].idx] = 1;
+
+        /* Found some candidates */
+        if (i != 0)
+                return i;
+
+        /* If no candidates can be found meeting the max_latency threshold
+         * then find the best of those we have, up to our limit.
+         */
+        for (i = 0; i < limit; i++)
+                hybrid_children[sorted_list[i].idx] = 1;
+
+        return i;
+}
+
 int
 find_best_down_child (xlator_t *this)
 {
@@ -4260,11 +4436,20 @@ static void dump_halo_states (xlator_t *this)
 {
         priv = this->private;
 
         for (i = 0; i < priv->child_count; i++) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "Child %d halo state: %s (%"PRIi64"ms)",
-                        i,
-                        priv->child_up[i] ? CHILD_UP_STR : CHILD_DOWN_STR,
-                        priv->child_latency[i]);
+                if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Child %d halo state: %s (N/A)",
+                                i,
+                                priv->child_up[i] ? CHILD_UP_STR :
+                                CHILD_DOWN_STR);
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Child %d halo state: %s (%"PRIi64" ms)",
+                                i,
+                                priv->child_up[i] ? CHILD_UP_STR :
+                                CHILD_DOWN_STR,
+                                priv->child_latency[i]);
+                }
         }
 }
@@ -4513,11 +4698,11 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
 
                 /*
                  * If this is an _actual_ CHILD_DOWN event, we
-                 * want to set the child_latency to < 0 to indicate
-                 * the child is really disconnected.
+                 * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to
+                 * indicate the child is really disconnected.
                  */
-                if (child_latency_msec < 0) {
-                        priv->child_latency[idx] = child_latency_msec;
+                if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) {
+                        priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY;
                 }
 
                 priv->child_up[idx] = 0;
@@ -4620,7 +4805,7 @@ afr_notify (xlator_t *this, int32_t event,
         gf_boolean_t    had_quorum = _gf_false;
         gf_boolean_t    has_quorum = _gf_false;
         int64_t         halo_max_latency_msec = 0;
-        int64_t         child_latency_msec = -1;
+        int64_t         child_latency_msec = AFR_CHILD_DOWN_LATENCY;
         gf_boolean_t    child_halo_enabled = _gf_false;
 
         child_xlator = (xlator_t *)data;
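The selection logic above only announces its decisions in the client log, so the quickest way to see what hybrid mode is doing on a live mount is to grep for the messages introduced in this file. A sketch follows; the log path is an assumption (FUSE client logs default to /var/log/glusterfs/<mount-path>.log), and the DEBUG/TRACE levels must be enabled as in the tests.

    # Hypothetical client log; the real name is derived from the mount point.
    CLIENT_LOG=/var/log/glusterfs/mnt-glusterfs.log

    # Per-child halo state and measured latency (DEBUG, dump_halo_states).
    grep "halo state:" "$CLIENT_LOG" | tail -n 6

    # Which children LOOKUPs were fanned out to (TRACE, afr_discover_do).
    grep "children for LOOKUPs" "$CLIENT_LOG" | tail -n 1

    # Which child is currently serving reads (TRACE, afr_halo_read_subvol).
    grep "Selected hybrid child" "$CLIENT_LOG" | tail -n 1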
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d4dd8ff8815..ae9b28c7fb4 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -195,6 +195,14 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
                           options, uint32, out);
 
+        GF_OPTION_RECONF ("halo-hybrid-mode",
+                          priv->halo_hybrid_mode, options, bool,
+                          out);
+
+        GF_OPTION_RECONF ("halo-hybrid-read-max-latency",
+                          priv->halo_hybrid_read_max_latency_msec, options,
+                          uint32, out);
+
         GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas,
                           options, uint32, out);
 
@@ -424,6 +432,13 @@ init (xlator_t *this)
         GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
 
+        GF_OPTION_INIT ("halo-hybrid-mode",
+                        priv->halo_hybrid_mode, bool, out);
+
+        GF_OPTION_INIT ("halo-hybrid-read-max-latency",
+                        priv->halo_hybrid_read_max_latency_msec, uint32,
+                        out);
+
         GF_OPTION_INIT ("halo-enabled",
                         priv->halo_enabled, bool, out);
 
@@ -505,12 +520,14 @@ init (xlator_t *this)
                 goto out;
         }
 
-        for (i = 0; i < child_count; i++)
+        for (i = 0; i < child_count; i++) {
+                priv->child_latency[i] = 0.0;
                 priv->child_up[i] = -1; /* start with unknown state.
                                            this initialization needed
                                            for afr_notify() to work
                                            reliably */
+        }
 
         priv->children = GF_CALLOC (sizeof (xlator_t *),
                                     child_count, gf_afr_mt_xlator_t);
@@ -751,6 +768,27 @@ struct volume_options options[] = {
           .default_value = "5",
           .description = "Maximum latency for halo replication in msec."
        },
+       { .key = {"halo-hybrid-mode"},
+         .type = GF_OPTION_TYPE_BOOL,
+         .default_value = "off",
+         .description = "Enable hybrid sync mounts.  When enabled, halo "
+                        "will do write FOPs synchronously, and read FOPs "
+                        "will be serviced in-region if the inode is "
+                        "clean/consistent.  If no bricks can be found below "
+                        "halo-hybrid-read-max-latency then the best 2 shall "
+                        "be selected.  This option can be used in "
+                        "conjunction with all other halo options."
+       },
+       { .key = {"halo-hybrid-read-max-latency"},
+         .type = GF_OPTION_TYPE_INT,
+         .min = 1,
+         .max = 99999,
+         .default_value = "8",
+         .description = "Maximum latency hybrid mode will use to select "
+                        "children for read FOPs.  Don't tune this unless "
+                        "you really know what you are doing (i.e. you've "
+                        "read/understood the associated source code)."
+       },
        { .key = {"halo-max-replicas"},
          .type = GF_OPTION_TYPE_INT,
          .min = 1,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f66cdbbf56a..d09aa6852c8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,6 +30,9 @@
 
 #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
 #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_CHILD_DOWN_LATENCY       INT64_MAX /* Latency of down children */
+#define AFR_HALO_HYBRID_CHILD_LIMIT  2    /* Fallback: pick the best 2 bricks */
+#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */
 #define AFR_LOCKEE_COUNT_MAX    3
 #define AFR_DOM_COUNT_MAX       3
 #define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/
@@ -65,6 +68,12 @@ struct afr_nfsd {
         uint32_t halo_max_latency_msec;
 };
 
+struct afr_child {
+        uint32_t      idx;
+        int64_t       latency;
+        unsigned char child_up;
+};
+
 typedef struct _afr_private {
         gf_lock_t lock;               /* to guard access to child_count, etc */
         unsigned int child_count;     /* total number of children */
@@ -149,7 +158,10 @@ typedef struct _afr_private {
         char                   *afr_dirty;
 
         gf_boolean_t            halo_enabled;
+        /* Halo geo-replication tunables */
         gf_boolean_t            halo_failover_enabled;
+        gf_boolean_t            halo_hybrid_mode;
+        uint32_t                halo_hybrid_read_max_latency_msec;
         uint32_t                halo_max_latency_msec;
         uint32_t                halo_max_replicas;
         uint32_t                halo_min_replicas;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index f28294301f8..bcb8877c5bd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
          .op_version = 2,
          .flags      = OPT_FLAG_CLIENT_OPT
        },
+       { .key        = "cluster.halo-hybrid-mode",
+         .voltype    = "cluster/replicate",
+         .op_version = 2,
+         .flags      = OPT_FLAG_CLIENT_OPT
+       },
        { .key        = "cluster.halo-failover-enabled",
          .voltype    = "cluster/replicate",
          .op_version = 2,
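Finally, a usage note on how these options are surfaced: cluster.halo-hybrid-mode is registered with glusterd above, so it is settable with gluster volume set, while halo thresholds can also be overridden per client at mount time with --xlator-option, which is exactly how halo-hybrid.t neutralizes halo-max-latency. A sketch of such a mount follows; the host, volume, and mount-point names are illustrative, and whether overriding halo-hybrid-read-max-latency the same way is advisable is an assumption rather than something this patch exercises.

    SERVER=gfs-server1        # hypothetical volfile server
    VOL=myvol                 # hypothetical volume
    MNT=/mnt/$VOL             # hypothetical mount point

    # FUSE mount that overrides a replicate (AFR) option for this client
    # only -- the same mechanism the test uses for halo-max-latency.
    glusterfs --volfile-id=/$VOL \
              --volfile-server=$SERVER \
              --xlator-option "*replicate*.halo-max-latency=10" \
              $MNT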