-rw-r--r--  tests/basic/halo-failover-disabled.t             10
-rw-r--r--  tests/basic/halo-hybrid.t                        70
-rw-r--r--  xlators/cluster/afr/src/afr-common.c            223
-rw-r--r--  xlators/cluster/afr/src/afr.c                    40
-rw-r--r--  xlators/cluster/afr/src/afr.h                    12
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c   5
6 files changed, 337 insertions(+), 23 deletions(-)
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 31a1d166404..4cc66e38de6 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -35,6 +35,10 @@ TEST $CLI volume set $V0 cluster.data-self-heal on
TEST $CLI volume set $V0 cluster.metadata-self-heal on
TEST $CLI volume set $V0 cluster.self-heal-daemon on
TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+
# Use a large ping time here so the spare brick is not marked up
# based on the ping time. The only way it can get marked up is
# by being swapped in via the down event (which is what we are disabling).
@@ -47,8 +51,8 @@ cd $M0
# Write some data to the mount
TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
-# Kill the first brick, fail-over to 3rd
-TEST kill_brick $V0 $H0 $B0/${V0}0
+UP_IDX=$(grep "halo state: UP" /var/log/glusterfs/$M0LOG | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
# Test that quorum should fail and the mount is RO, the reason here
# is that although there _is_ another brick running which _could_
@@ -59,7 +63,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0
TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
TEST $CLI volume start $V0 force
-sleep 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX
# Test that quorum should be restored and the file is writable
TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
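For reference, the UP_IDX extraction above keys off the "Child N halo state: UP" message emitted by dump_halo_states() later in this patch. A minimal sketch of that parsing, against a made-up client-log line (timestamp, component and translator name are illustrative only; only the "Child N halo state: UP" part comes from the patch):

# Hypothetical log excerpt:
#   [2017-06-20 01:02:03.4] D [afr-common.c:dump_halo_states] 0-patchy-replicate-0: Child 2 halo state: UP (5 ms)
UP_IDX=$(grep "halo state: UP" /var/log/glusterfs/$M0LOG | tail -n1 \
        | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
echo "most recently reported UP child: $UP_IDX"   # -> 2 for the line above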
diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t
new file mode 100644
index 00000000000..4574fdfe41e
--- /dev/null
+++ b/tests/basic/halo-hybrid.t
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test for the Halo hybrid feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+# heal daemon is off to start.
+# 2. Write some data
+# 3. Verify hybrid code chose children for lookups
+# 4. Verify hybrid code chose child for reads
+# 5. Verify hybrid code wrote synchronously to all replicas
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function found_fuse_log_msg {
+        local msg="$1"
+        local cnt=$(grep "$msg" /var/log/glusterfs/$M0LOG | tail -n1 | wc -l)
+        if (( $cnt == 1 )); then
+                echo "Y"
+        else
+                echo "N"
+        fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-hybrid-mode True
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level TRACE
+TEST $CLI volume start $V0
+
+# Start a synchronous mount
+TEST glusterfs --volfile-id=/$V0 \
+ --xlator-option *replicate*.halo-max-latency=9999 \
+ --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+sleep 2
+cd $M0
+
+TEST mkdir testdir
+TEST cd testdir
+for i in {1..5}
+do
+ dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+TEST ls -l
+
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs"
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child"
+
+B0_CNT=$(ls $B0/${V0}0/testdir | wc -l)
+B1_CNT=$(ls $B0/${V0}1/testdir | wc -l)
+B2_CNT=$(ls $B0/${V0}2/testdir | wc -l)
+
+# Writes should be synchronous, all should have same
+# file count
+TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))"
+
+cleanup
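The two EXPECT_WITHIN checks above rely on TRACE-level messages added to afr-common.c by this patch. A rough way to pull the same information out of a test mount's client log by hand (log path and sample output are illustrative):

# Which replicas were fanned out to for LOOKUPs (logged by afr_discover_do):
grep "children for LOOKUPs" /var/log/glusterfs/$M0LOG | tail -n1
# Which child is servicing reads (logged by afr_halo_read_subvol):
grep "Selected hybrid child" /var/log/glusterfs/$M0LOG | tail -n1
# e.g. "... Selected hybrid child 0 for reads"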
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c4b6fd6a9b6..0c621271405 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,6 +45,10 @@
#define CHILD_UP_STR "UP"
#define CHILD_DOWN_STR "DOWN"
+#define CHILD_DISCONNECTED_STR "DOWN"
+
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *fastest_children);
call_frame_t *
afr_copy_frame (call_frame_t *base)
@@ -1371,21 +1375,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
sizeof(gfid_copy)) % child_count;
}
+/*
+ * afr_halo_read_subvol
+ *
+ * Given an array representing the readable children, this function returns
+ * the readable child which meets the halo hybrid criteria.  If none is
+ * found, -1 is returned and another strategy will have to be used to figure
+ * out where the read should come from.
+ */
+int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) {
+ afr_private_t *priv = NULL;
+ unsigned char *hybrid_children;
+ int32_t hybrid_cnt = 0;
+ int read_subvol = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ /* Halo inactive or hybrid mode disabled; bail. */
+ if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+ return -1;
+
+ /* AFR discovery edge case: if we are already pinned to a child
+ * which meets the latency threshold, then go with this child for
+ * consistency purposes.
+ */
+ if (priv->read_child >= 0 && readable[priv->read_child] &&
+ priv->child_latency[priv->read_child] <=
+ AFR_HALO_HYBRID_LATENCY_MSEC) {
+ return priv->read_child;
+ }
+
+ hybrid_children = alloca0 (priv->child_count);
+ hybrid_cnt = find_hybrid_children (this, hybrid_children);
+ if (hybrid_cnt) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i] && hybrid_children[i]) {
+ read_subvol = i;
+ priv->read_child = read_subvol;
+ gf_log (this->name, GF_LOG_TRACE,
+ "Selected hybrid child %d for reads",
+ i);
+ break;
+ }
+ }
+ }
+
+ return read_subvol;
+}
+
int
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
unsigned char *readable,
afr_read_subvol_args_t *args)
{
- int i = 0;
- int read_subvol = -1;
- afr_private_t *priv = NULL;
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
afr_read_subvol_args_t local_args = {0,};
- priv = this->private;
+ priv = this->private;
+
+ /* Choose lowest latency child for reads */
+ read_subvol = afr_halo_read_subvol (this, readable);
+ if (read_subvol != -1)
+ return read_subvol;
- /* first preference - explicitly specified or local subvolume */
- if (priv->read_child >= 0 && readable[priv->read_child])
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
return priv->read_child;
if (inode_is_linked (inode)) {
@@ -1411,7 +1469,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
return -1;
}
-
int
afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
unsigned char *readable, int *event_p,
@@ -2071,6 +2128,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv->children[child_index]->name);
priv->read_child = child_index;
+ } else if (priv->halo_enabled) {
+ if (priv->read_child < 0) {
+ priv->read_child = child_index;
+ } else if (priv->child_latency[child_index] <
+ priv->child_latency[priv->read_child]) {
+ priv->read_child = child_index;
+ }
}
out:
STACK_DESTROY(frame->root);
@@ -2487,6 +2551,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int call_count = 0;
+ unsigned char *hybrid_children = NULL;
local = frame->local;
priv = this->private;
@@ -2497,8 +2562,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
goto out;
}
- call_count = local->call_count = AFR_COUNT (local->child_up,
- priv->child_count);
+ hybrid_children = alloca0 (priv->child_count);
+ call_count = find_hybrid_children (this, hybrid_children);
+ if (call_count) {
+ for (i = 0; i < priv->child_count; i++)
+ local->child_up[i] = hybrid_children[i];
+ gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid "
+ "children for LOOKUPs", call_count);
+ } else {
+ hybrid_children = NULL;
+ call_count = AFR_COUNT (local->child_up, priv->child_count);
+ }
+
+ local->call_count = call_count;
ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
&local->loc);
@@ -2731,6 +2807,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
+ /* This is the "secret" to why "hybrid" halo works.  Encoded in
+ * the cached inodes, we store what is effectively the "generational"
+ * state of the cluster, along with a "packed" version of the extended
+ * attributes which determine which nodes are wise/fools.  We consult
+ * these cached values to figure out whom we can trust.  If the state
+ * of the cluster changes and we can no longer trust the cached info,
+ * we "refresh" the inode (and hit all regions) to ensure we know which
+ * bricks we can safely read from.
+ */
if (event != local->event_generation)
afr_inode_refresh (frame, this, loc->parent, NULL,
afr_lookup_do);
@@ -4203,6 +4288,97 @@ __get_heard_from_all_status (xlator_t *this)
return heard_from_all;
}
+/*
+ * _afr_cmp_child
+ *
+ * Comparator passed to qsort() to order a list of children by latency.
+ */
+static int
+_afr_cmp_child (const void *child1, const void *child2)
+{
+ struct afr_child *child11 = (struct afr_child *)child1;
+ struct afr_child *child22 = (struct afr_child *)child2;
+
+ if (child11->latency > child22->latency) {
+ return 1;
+ }
+ if (child11->latency == child22->latency) {
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * find_hybrid_children
+ *
+ * Given a char array representing our children (aka bricks within our
+ * AFR "subvolume"), mark the children whose latency is within
+ * halo_hybrid_read_max_latency_msec or, if none fit this condition,
+ * pick the fastest two bricks instead.
+ *
+ * You might ask, why not just pick the quickest brick and be done with it?
+ * Well, being within our set is not sufficient to be chosen for the read;
+ * a child must also be marked "readable", so we still want to choose as
+ * many children as we can within our local region to ensure somebody in
+ * the set is readable.
+ *
+ * To illustrate this, consider the case where only one of two bricks
+ * received a sync from some other writer, and the second brick, although
+ * faster, was not present for that write.  In that case we want to use the
+ * slower brick to service the read.
+ *
+ * In short, this function just tells the caller which children are hybrid
+ * candidates; it gives no signal as to their readability, nor should it,
+ * since that is handled later in the various flows (e.g. by
+ * afr_halo_read_subvol).
+ */
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *hybrid_children)
+{
+ int32_t i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_child *sorted_list = NULL;
+ uint32_t max_latency;
+ uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT;
+
+ priv = this->private;
+
+ if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+ return 0;
+
+ if (limit > priv->child_count)
+ limit = priv->child_count;
+
+ max_latency = priv->halo_hybrid_read_max_latency_msec;
+
+ sorted_list = alloca (sizeof (struct afr_child) * priv->child_count);
+
+ /* Record each child's index, up/down state and current latency */
+ for (i = 0; i < priv->child_count; i++) {
+ sorted_list[i].idx = i;
+ sorted_list[i].child_up = priv->child_up[i];
+ sorted_list[i].latency = priv->child_latency[i];
+ }
+
+ /* QuickSort the children according to latency */
+ qsort (sorted_list, priv->child_count, sizeof (struct afr_child),
+ _afr_cmp_child);
+
+ i = 0;
+ while (i < priv->child_count && sorted_list[i].latency <= max_latency)
+ hybrid_children[sorted_list[i++].idx] = 1;
+
+ /* Found some candidates */
+ if (i != 0)
+ return i;
+
+ /* If no candidates meet the max_latency threshold, then take the
+ * best children we have, up to our limit.
+ */
+ for (i = 0; i < limit; i++)
+ hybrid_children[sorted_list[i].idx] = 1;
+
+ return i;
+}
+
int
find_best_down_child (xlator_t *this)
{
@@ -4260,11 +4436,20 @@ static void dump_halo_states (xlator_t *this) {
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Child %d halo state: %s (%"PRIi64"ms)",
- i,
- priv->child_up[i] ? CHILD_UP_STR : CHILD_DOWN_STR,
- priv->child_latency[i]);
+ if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Child %d halo state: %s (N/A)",
+ i,
+ priv->child_up[i] ? CHILD_UP_STR :
+ CHILD_DOWN_STR);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Child %d halo state: %s (%"PRIi64" ms)",
+ i,
+ priv->child_up[i] ? CHILD_UP_STR :
+ CHILD_DOWN_STR,
+ priv->child_latency[i]);
+ }
}
}
@@ -4513,11 +4698,11 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
/*
* If this is an _actual_ CHILD_DOWN event, we
- * want to set the child_latency to < 0 to indicate
- * the child is really disconnected.
+ * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to
+ * indicate the child is really disconnected.
*/
- if (child_latency_msec < 0) {
- priv->child_latency[idx] = child_latency_msec;
+ if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) {
+ priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY;
}
priv->child_up[idx] = 0;
@@ -4620,7 +4805,7 @@ afr_notify (xlator_t *this, int32_t event,
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
int64_t halo_max_latency_msec = 0;
- int64_t child_latency_msec = -1;
+ int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY;
gf_boolean_t child_halo_enabled = _gf_false;
child_xlator = (xlator_t *)data;
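Taken together, find_hybrid_children() and afr_halo_read_subvol() implement a simple policy: sort the children by latency, keep everything at or below halo-hybrid-read-max-latency, and if nothing qualifies fall back to the AFR_HALO_HYBRID_CHILD_LIMIT (2) fastest children. A toy shell model of that rule, for intuition only (indexes and latencies are made up; the real code operates on priv->child_latency in C):

threshold=8                          # halo-hybrid-read-max-latency, in msec
latencies="0:3 1:12 2:47"            # idx:latency pairs, hypothetical values
sorted=$(echo $latencies | tr ' ' '\n' | sort -t: -k2 -n)
picked=$(echo "$sorted" | awk -F: -v t=$threshold '$2 <= t {print $1}')
if [ -z "$picked" ]; then
        # nobody under the threshold: fall back to the two fastest children
        picked=$(echo "$sorted" | head -n2 | cut -d: -f1)
fi
echo "hybrid candidates:" $picked    # -> "0" here; "0 1" if all were > 8 msec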
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d4dd8ff8815..ae9b28c7fb4 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -195,6 +195,14 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
options, uint32, out);
+ GF_OPTION_RECONF ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, options,
+ uint32, out);
+
GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options,
uint32, out);
@@ -424,6 +432,13 @@ init (xlator_t *this)
GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
+ GF_OPTION_INIT ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, bool, out);
+
+ GF_OPTION_INIT ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, uint32,
+ out);
+
GF_OPTION_INIT ("halo-enabled",
priv->halo_enabled, bool, out);
@@ -505,12 +520,14 @@ init (xlator_t *this)
goto out;
}
- for (i = 0; i < child_count; i++)
+ for (i = 0; i < child_count; i++) {
+ priv->child_latency[i] = 0.0;
priv->child_up[i] = -1; /* start with unknown state.
this initialization needed
for afr_notify() to work
reliably
*/
+ }
priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
gf_afr_mt_xlator_t);
@@ -751,6 +768,27 @@ struct volume_options options[] = {
.default_value = "5",
.description = "Maximum latency for halo replication in msec."
},
+ { .key = {"halo-hybrid-mode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enable hybrid sync mounts. When enabled, halo will "
+ "do write FOPs synchronously, and read FOPs will be "
+ "serviced in-region if the inode is clean/consistent. "
+ "If no bricks can be found below "
+ "halo-hybrid-read-max-latency then the best 2 shall "
+ "be selected. This option can be used in "
+ "conjunction with all other halo options."
+ },
+ { .key = {"halo-hybrid-read-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "8",
+ .description = "Maximum latency (in msec) that hybrid mode will use to "
+ "select children for read FOPs. Don't tune this unless "
+ "you really know what you are doing (i.e. you have "
+ "read and understood the associated source code)."
+ },
{ .key = {"halo-max-replicas"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
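Both new keys are ordinary AFR xlator options, so besides "gluster volume set" (see the glusterd hunk below) they can be forced per mount the same way the hybrid test above forces halo-max-latency. A hedged example (volume name, host and mountpoint are placeholders):

glusterfs --volfile-id=/myvol --volfile-server=myhost \
        --xlator-option '*replicate*.halo-hybrid-mode=on' \
        --xlator-option '*replicate*.halo-hybrid-read-max-latency=8' \
        /mnt/myvol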
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f66cdbbf56a..d09aa6852c8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,6 +30,9 @@
#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* Latency for down children */
+#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Fallback: pick the 2 fastest bricks */
+#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */
#define AFR_LOCKEE_COUNT_MAX 3
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
@@ -65,6 +68,12 @@ struct afr_nfsd {
uint32_t halo_max_latency_msec;
};
+struct afr_child {
+ uint32_t idx;
+ int64_t latency;
+ unsigned char child_up;
+};
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -149,7 +158,10 @@ typedef struct _afr_private {
char *afr_dirty;
gf_boolean_t halo_enabled;
+ /* Halo geo-replication tunables */
gf_boolean_t halo_failover_enabled;
+ gf_boolean_t halo_hybrid_mode;
+ uint32_t halo_hybrid_read_max_latency_msec;
uint32_t halo_max_latency_msec;
uint32_t halo_max_replicas;
uint32_t halo_min_replicas;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index f28294301f8..bcb8877c5bd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 2,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.halo-hybrid-mode",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "cluster.halo-failover-enabled",
.voltype = "cluster/replicate",
.op_version = 2,