From ddc044bfa2840981de4003c3b9efcac84387dc2b Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Mon, 12 Mar 2012 09:32:40 -0400 Subject: replicate: add hashed read-child method. Both the first-to-respond method and the round-robin method are susceptible to clients repeatedly choosing the same servers across a series of opens, creating hot spots. Also, the code to handle a replica being down will ignore both methods and just choose the first remaining replica (which is not an issue for two-way replication but can be otherwise). The hashed method more reliably avoids such hot spots. There are three values/modes. 0: use the old (broken) methods. 1: select a read-child based on a hash of the file's GFID, so all clients will choose the same subvolume for a file (ensuring maximum consistency) but will distribute load across a set of files. 2: select a read-child based on a hash of the file's GFID plus the client's PID, so different clients can choose different children, distributing load even for a single file. Mode 2 will probably be optimal for most cases. Using response time when we open the file is problematic, both because a single sample might not have been representative even then and because load might have shifted in the hours or days since (for long-lived files). Trying to use more current load information can lead to "herd following" behavior, which is just as bad. Pseudo-random distribution is likely to be the best we can reasonably do, just as it is for DHT. 
Change-Id: I798c2760411eacf32e82a85f03bb7b08a4a49461 BUG: 802513 Signed-off-by: Jeff Darcy Reviewed-on: http://review.gluster.com/2926 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- xlators/cluster/afr/src/afr-common.c | 67 ++++++++++++++++++++++++---- xlators/cluster/afr/src/afr-dir-write.c | 15 ++++--- xlators/cluster/afr/src/afr-self-heal-data.c | 6 ++- xlators/cluster/afr/src/afr-self-heal.h | 3 +- xlators/cluster/afr/src/afr.c | 14 ++++++ xlators/cluster/afr/src/afr.h | 6 ++- 6 files changed, 93 insertions(+), 18 deletions(-) (limited to 'xlators/cluster/afr/src') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 300ab92ef..21a2be3dd 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -546,6 +546,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources, gf_boolean_t success_child = _gf_false; gf_boolean_t source = _gf_false; + if (child < 0) { + return _gf_false; + } + GF_ASSERT (success_children); GF_ASSERT (child_count > 0); @@ -562,13 +566,44 @@ out: return (success_child && source); } +int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, + unsigned int hmode, uuid_t gfid) +{ + uuid_t gfid_copy = {0,}; + + if (!hmode) { + return -1; + } + + if (gfid) { + uuid_copy(gfid_copy,gfid); + } + if (hmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients will converge on the same subvolume. 
+ */ + *((pid_t *)gfid_copy) = getpid(); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} + /* If sources is NULL the xattrs are assumed to be of source for all * success_children. */ int -afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources) +afr_select_read_child_from_policy (int32_t *success_children, + int32_t child_count, int32_t prev_read_child, + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid) { int32_t read_child = -1; int i = 0; @@ -585,6 +620,13 @@ afr_select_read_child_from_policy (int32_t *success_children, int32_t child_coun read_child)) goto out; + read_child = afr_hash_child (success_children, child_count, + hmode, gfid); + if (afr_is_read_child (success_children, sources, child_count, + read_child)) { + goto out; + } + for (i = 0; i < child_count; i++) { read_child = success_children[i]; if (read_child < 0) @@ -604,7 +646,7 @@ out: void afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child) + int32_t config_read_child, uuid_t gfid) { int read_child = -1; afr_private_t *priv = NULL; @@ -614,7 +656,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, priv->child_count, prev_read_child, config_read_child, - NULL); + NULL, + priv->hash_mode, gfid); if (read_child >= 0) afr_inode_set_read_ctx (this, inode, read_child, fresh_children); @@ -1271,6 +1314,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, dict_t **xattrs = NULL; int32_t *success_children = NULL; afr_transaction_type type = AFR_METADATA_TRANSACTION; + uuid_t *gfid = NULL; GF_ASSERT (local); GF_ASSERT (this); @@ -1284,8 +1328,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; type = afr_transaction_type_get (ia_type); 
xattrs = local->cont.lookup.xattrs; + gfid = &local->cont.lookup.buf.ia_gfid; source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type); + type, *gfid); if (source < 0) { gf_log (this->name, GF_LOG_DEBUG, "failed to select source " "for %s", local->loc.path); @@ -2131,8 +2176,14 @@ afr_lookup (call_frame_t *frame, xlator_t *this, } else { LOCK (&priv->read_child_lock); { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); + if (priv->hash_mode) { + local->read_child_index = -1; + } + else { + local->read_child_index = + (++priv->read_child_rr) % + (priv->child_count); + } } UNLOCK (&priv->read_child_lock); local->cont.lookup.fresh_lookup = _gf_true; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 9f2b975df..0b804bef5 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -196,7 +196,8 @@ unlock: afr_set_read_ctx_from_policy (this, inode, local->fresh_children, local->read_child_index, - priv->read_child); + priv->read_child, + local->cont.create.buf.ia_gfid); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -429,7 +430,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_set_read_ctx_from_policy (this, inode, local->fresh_children, local->read_child_index, - priv->read_child); + priv->read_child, + local->cont.mknod.buf.ia_gfid); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -657,7 +659,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_set_read_ctx_from_policy (this, inode, local->fresh_children, local->read_child_index, - priv->read_child); + priv->read_child, + local->cont.mkdir.buf.ia_gfid); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -887,7 +890,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_set_read_ctx_from_policy 
(this, inode, local->fresh_children, local->read_child_index, - priv->read_child); + priv->read_child, + local->cont.link.buf.ia_gfid); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -1110,7 +1114,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_set_read_ctx_from_policy (this, inode, local->fresh_children, local->read_child_index, - priv->read_child); + priv->read_child, + local->cont.symlink.buf.ia_gfid); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index acc96697f..93b645295 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -796,7 +796,8 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type) + afr_transaction_type txn_type, + uuid_t gfid) { afr_private_t *priv = NULL; int read_child = -1; @@ -855,7 +856,8 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, priv->child_count, prev_read_child, config_read_child, - sources); + sources, + priv->hash_mode, gfid); out: afr_matrix_cleanup (pending_matrix, priv->child_count); gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 627a2115a..2efc1116d 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -45,5 +45,6 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type txn_type); + afr_transaction_type txn_type, + uuid_t gfid); #endif /* __AFR_SELF_HEAL_H__ */ diff --git 
a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 8e94d5497..b7ba26197 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -149,6 +149,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); + if (read_subvol) { index = xlator_subvolume_index (this, read_subvol); if (index == -1) { @@ -237,6 +240,8 @@ init (xlator_t *this) } } + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); + priv->favorite_child = -1; GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); if (fav_child) { @@ -494,6 +499,15 @@ struct volume_options options[] = { { .key = {"read-subvolume" }, .type = GF_OPTION_TYPE_XLATOR }, + { .key = {"read-hash-mode" }, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 2, + .default_value = "0", + .description = "0 = first responder, " + "1 = hash by GFID (all clients use same subvolume), " + "2 = hash by GFID and client PID", + }, { .key = {"favorite-child"}, .type = GF_OPTION_TYPE_XLATOR }, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 27cb83a57..a1a30562b 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -130,6 +130,7 @@ typedef struct _afr_private { gf_boolean_t entry_change_log; /* on/off */ int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ @@ -936,12 +937,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count); int afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources); + int32_t config_read_child, int32_t *sources, + unsigned int hmode, uuid_t gfid); void afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, int32_t 
*fresh_children, int32_t prev_read_child, - int32_t config_read_child); + int32_t config_read_child, uuid_t gfid); int32_t afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, -- cgit