diff options
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 101 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 7 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 17 |
3 files changed, 98 insertions, 27 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 96f13ce2cee..6863bd02c50 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1597,19 +1597,18 @@ out: } int -afr_least_pending_reads_child(afr_private_t *priv) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) { int i = 0; - int child = 0; + int child = -1; int64_t read_iter = -1; int64_t pending_read = -1; - pending_read = GF_ATOMIC_GET(priv->pending_reads[0]); - for (i = 1; i < priv->child_count; i++) { - if (AFR_IS_ARBITER_BRICK(priv, i)) + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) continue; read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); - if (read_iter < pending_read) { + if (child == -1 || read_iter < pending_read) { pending_read = read_iter; child = i; } @@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv) return child; } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; + } + } + return child; +} + +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; + + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; + } + } + return child; +} + int -afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) { uuid_t gfid_copy = { 0, @@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) int child = -1; switch (priv->hash_mode) { - case 0: + case AFR_READ_POLICY_FIRST_UP: break; - case 1: + case AFR_READ_POLICY_GFID_HASH: gf_uuid_copy(gfid_copy, args->gfid); child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 2: + case AFR_READ_POLICY_GFID_PID_HASH: if (args->ia_type != IA_IFDIR) { /* * Why getpid? Because it's one of the cheapest calls @@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 3: - child = afr_least_pending_reads_child(priv); + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); break; } @@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, } /* second preference - use hashed mode */ - read_subvol = afr_hash_child(&local_args, priv); + read_subvol = afr_hash_child(&local_args, priv, readable); if (read_subvol >= 0 && readable[read_subvol]) return read_subvol; @@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, * want to set the child_latency to MAX to indicate * the child needs ping data to be available before doing child-up */ - if (child_latency_msec < 0 && priv->halo_enabled) { + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { /*set to INT64_MAX-1 so that it is found for best_down_child*/ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; } @@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, "up_children (%d) > halo_max_replicas (%d)", worst_up_child, up_children, priv->halo_max_replicas); } - +out: if (up_children == 1) { gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, "Subvolume '%s' came back up; " @@ -5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, * as we want it to be up to date if we are going to * begin using it synchronously. */ - if (up_children < priv->halo_min_replicas) { + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { best_down_child = find_best_down_child(this); if (best_down_child >= 0) { gf_msg_debug(this->name, 0, @@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, *up_child = best_down_child; } } - for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) down_children++; @@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) had_quorum = priv->quorum_count && afr_has_quorum(priv->child_up, this, NULL); - if (priv->halo_enabled) { - halo_max_latency_msec = afr_get_halo_latency(this); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency(this); - if (event == GF_EVENT_CHILD_PING) { /* Calculates the child latency and sets event */ - child_latency_msec = (int64_t)(uintptr_t)data2; LOCK(&priv->lock); { __afr_handle_ping_event(this, child_xlator, idx, @@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) child_latency_msec); } UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); } } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 67ff3409bb9..33a25cc5c0c 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -790,7 +790,7 @@ struct volume_options options[] = { {.key = {"read-hash-mode"}, .type = GF_OPTION_TYPE_INT, .min = 0, - .max = 3, + .max = 5, .default_value = "1", .op_version = {2}, .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, @@ -803,7 +803,10 @@ struct volume_options options[] = { "1 = hash by GFID of file (all clients use " "same subvolume).\n" "2 = hash by GFID of file and client PID.\n" - "3 = brick having the least outstanding read requests."}, + "3 = brick having the least outstanding read requests.\n" + "4 = brick having the least network ping latency.\n" + "5 = Hybrid mode between 3 and 4, ie least value among " + "network-latency multiplied by outstanding-read-requests."}, { .key = {"choose-local"}, .type = GF_OPTION_TYPE_BOOL, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 1a409ec625b..db83b395e02 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -96,6 +96,15 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); } while (0) typedef enum { + AFR_READ_POLICY_FIRST_UP, + AFR_READ_POLICY_GFID_HASH, + AFR_READ_POLICY_GFID_PID_HASH, + AFR_READ_POLICY_LESS_LOAD, + AFR_READ_POLICY_LEAST_LATENCY, + AFR_READ_POLICY_LOAD_LATENCY_HYBRID, +} afr_read_hash_mode_t; + +typedef enum { AFR_FAV_CHILD_NONE, AFR_FAV_CHILD_BY_SIZE, AFR_FAV_CHILD_BY_CTIME, @@ -183,10 +192,10 @@ typedef struct _afr_private { gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ - unsigned int hash_mode; /* for when read_child is not set */ - gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ - int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ + afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ + gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic resolution of split-brains.*/ |