diff options
| author | Jeff Darcy <jdarcy@redhat.com> | 2012-03-12 09:32:40 -0400 | 
|---|---|---|
| committer | Anand Avati <avati@redhat.com> | 2012-05-31 17:29:01 -0700 | 
| commit | ddc044bfa2840981de4003c3b9efcac84387dc2b (patch) | |
| tree | a83d476702cac7ecc7ae59057c368f622a51af4c | |
| parent | e066a5fea7bdaa5da78e49c9a5bf344af2f33d3c (diff) | |
replicate: add hashed read-child method.
Both the first-to-respond method and the round-robin method are susceptible
to clients repeatedly choosing the same servers across a series of opens,
creating hot spots.  Also, the code to handle a replica being down will
ignore both methods and just choose the first remaining child (which is not an
issue for two-way replication but can be otherwise).  The hashed method more reliably
avoids such hot spots.  There are three values/modes.
0: use the old (broken) methods.
1: select a read-child based on a hash of the file's GFID, so all clients
   will choose the same subvolume for a file (ensuring maximum consistency)
   but will distribute load for a set of files.
2: select a read-child based on a hash of the file's GFID plus the client's
   PID, so different clients will spread load across children even for one file.
Mode 2 will probably be optimal for most cases.  Using response time when we
open the file is problematic, both because a single sample might not have
been representative even then and because load might have shifted in the
hours or days since (for long-lived files).  Trying to use more current load
information can lead to "herd following" behavior which is just as bad.
Pseudo-random distribution is likely to be the best we can reasonably do,
just as it is for DHT.
Change-Id: I798c2760411eacf32e82a85f03bb7b08a4a49461
BUG: 802513
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.com/2926
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 67 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 15 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 6 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 1 | 
7 files changed, 94 insertions, 18 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 300ab92efaf..21a2be3dd6f 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -546,6 +546,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources,          gf_boolean_t             success_child = _gf_false;          gf_boolean_t             source        = _gf_false; +        if (child < 0) { +                return _gf_false; +        } +          GF_ASSERT (success_children);          GF_ASSERT (child_count > 0); @@ -562,13 +566,44 @@ out:          return (success_child && source);  } +int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, +                unsigned int hmode, uuid_t gfid) +{ +        uuid_t  gfid_copy = {0,}; + +        if (!hmode) { +                return -1; +        } + +        if (gfid) { +               uuid_copy(gfid_copy,gfid); +        } +        if (hmode > 1) { +                /* +                 * Why getpid?  Because it's one of the cheapest calls +                 * available - faster than gethostname etc. - and returns a +                 * constant-length value that's sure to be shorter than a UUID. +                 * It's still very unlikely to be the same across clients, so +                 * it still provides good mixing.  We're not trying for +                 * perfection here.  All we need is a low probability that +                 * multiple clients won't converge on the same subvolume. +                 */ +                *((pid_t *)gfid_copy) = getpid(); +        } + +        return SuperFastHash((char *)gfid_copy, +                             sizeof(gfid_copy)) % child_count; +} +  /* If sources is NULL the xattrs are assumed to be of source for all   * success_children.   
*/  int -afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, -                                   int32_t prev_read_child, -                                   int32_t config_read_child, int32_t *sources) +afr_select_read_child_from_policy (int32_t *success_children, +                                   int32_t child_count, int32_t prev_read_child, +                                   int32_t config_read_child, int32_t *sources, +                                   unsigned int hmode, uuid_t gfid)  {          int32_t                  read_child   = -1;          int                      i            = 0; @@ -585,6 +620,13 @@ afr_select_read_child_from_policy (int32_t *success_children, int32_t child_coun                                 read_child))                  goto out; +        read_child = afr_hash_child (success_children, child_count, +                                     hmode, gfid); +        if (afr_is_read_child (success_children, sources, child_count, +                               read_child)) { +                goto out; +        } +          for (i = 0; i < child_count; i++) {                  read_child = success_children[i];                  if (read_child < 0) @@ -604,7 +646,7 @@ out:  void  afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,                                int32_t *fresh_children, int32_t prev_read_child, -                              int32_t config_read_child) +                              int32_t config_read_child, uuid_t gfid)  {          int                      read_child = -1;          afr_private_t            *priv = NULL; @@ -614,7 +656,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,                                                          priv->child_count,                                                          prev_read_child,                                                          config_read_child, -                                                        
NULL); +                                                        NULL, +                                                        priv->hash_mode, gfid);          if (read_child >= 0)                  afr_inode_set_read_ctx (this, inode, read_child,                                          fresh_children); @@ -1271,6 +1314,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,          dict_t                  **xattrs       = NULL;          int32_t                 *success_children = NULL;          afr_transaction_type    type           = AFR_METADATA_TRANSACTION; +        uuid_t                  *gfid          = NULL;          GF_ASSERT (local);          GF_ASSERT (this); @@ -1284,8 +1328,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,          ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;          type = afr_transaction_type_get (ia_type);          xattrs = local->cont.lookup.xattrs; +        gfid = &local->cont.lookup.buf.ia_gfid;          source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, -                                                           type); +                                                           type, *gfid);          if (source < 0) {                  gf_log (this->name, GF_LOG_DEBUG, "failed to select source "                          "for %s", local->loc.path); @@ -2131,8 +2176,14 @@ afr_lookup (call_frame_t *frame, xlator_t *this,          } else {                  LOCK (&priv->read_child_lock);                  { -                        local->read_child_index = (++priv->read_child_rr) -                                % (priv->child_count); +                        if (priv->hash_mode) { +                                local->read_child_index = -1; +                        } +                        else { +                                local->read_child_index = +                                        (++priv->read_child_rr) % +                                 
       (priv->child_count); +                        }                  }                  UNLOCK (&priv->read_child_lock);                  local->cont.lookup.fresh_lookup = _gf_true; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 9f2b975df6f..0b804bef580 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -196,7 +196,8 @@ unlock:                  afr_set_read_ctx_from_policy (this, inode,                                                local->fresh_children,                                                local->read_child_index, -                                              priv->read_child); +                                              priv->read_child, +                                              local->cont.create.buf.ia_gfid);                  local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this); @@ -429,7 +430,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  afr_set_read_ctx_from_policy (this, inode,                                                local->fresh_children,                                                local->read_child_index, -                                              priv->read_child); +                                              priv->read_child, +                                              local->cont.mknod.buf.ia_gfid);                  local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this); @@ -657,7 +659,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  afr_set_read_ctx_from_policy (this, inode,                                                local->fresh_children,                                                local->read_child_index, -                                              priv->read_child); +                                              
priv->read_child, +                                              local->cont.mkdir.buf.ia_gfid);                  local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this); @@ -887,7 +890,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  afr_set_read_ctx_from_policy (this, inode,                                                local->fresh_children,                                                local->read_child_index, -                                              priv->read_child); +                                              priv->read_child, +                                              local->cont.link.buf.ia_gfid);                  local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this); @@ -1110,7 +1114,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  afr_set_read_ctx_from_policy (this, inode,                                                local->fresh_children,                                                local->read_child_index, -                                              priv->read_child); +                                              priv->read_child, +                                              local->cont.symlink.buf.ia_gfid);                  local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index acc96697ff4..93b64529543 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -796,7 +796,8 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)  int  afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,                                            dict_t **xattr, -                                          afr_transaction_type 
txn_type) +                                          afr_transaction_type txn_type, +                                          uuid_t gfid)  {          afr_private_t            *priv      = NULL;          int                      read_child = -1; @@ -855,7 +856,8 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,                                                          priv->child_count,                                                          prev_read_child,                                                          config_read_child, -                                                        sources); +                                                        sources, +                                                        priv->hash_mode, gfid);  out:          afr_matrix_cleanup (pending_matrix, priv->child_count);          gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 627a2115a09..2efc1116d33 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -45,5 +45,6 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode);  int  afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,                                            dict_t **xattr, -                                          afr_transaction_type txn_type); +                                          afr_transaction_type txn_type, +                                          uuid_t gfid);  #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 8e94d549737..b7ba2619711 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -149,6 +149,9 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); +        GF_OPTION_RECONF ("read-hash-mode", 
priv->hash_mode, +                          options, uint32, out); +          if (read_subvol) {                  index = xlator_subvolume_index (this, read_subvol);                  if (index == -1) { @@ -237,6 +240,8 @@ init (xlator_t *this)                  }          } +        GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); +          priv->favorite_child = -1;          GF_OPTION_INIT ("favorite-child", fav_child, xlator, out);          if (fav_child) { @@ -494,6 +499,15 @@ struct volume_options options[] = {          { .key  = {"read-subvolume" },            .type = GF_OPTION_TYPE_XLATOR          }, +        { .key = {"read-hash-mode" }, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, +          .max = 2, +          .default_value = "0", +          .description = "0 = first responder, " +                         "1 = hash by GFID (all clients use same subvolume), " +                         "2 = hash by GFID and client PID", +        },          { .key  = {"favorite-child"},            .type = GF_OPTION_TYPE_XLATOR          }, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 27cb83a5791..a1a30562bf1 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -130,6 +130,7 @@ typedef struct _afr_private {          gf_boolean_t entry_change_log;      /* on/off */          int read_child;               /* read-subvolume */ +        unsigned int hash_mode;       /* for when read_child is not set */          int favorite_child;  /* subvolume to be preferred in resolving                                           split-brain cases */ @@ -936,12 +937,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count);  int  afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,                                     int32_t prev_read_child, -                                   int32_t config_read_child, int32_t *sources); +                                   
int32_t config_read_child, int32_t *sources, +                                   unsigned int hmode, uuid_t gfid);  void  afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,                                int32_t *fresh_children, int32_t prev_read_child, -                              int32_t config_read_child); +                              int32_t config_read_child, uuid_t gfid);  int32_t  afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index fdbc8c4945d..6acf3bbf6dd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -121,6 +121,7 @@ static struct volopt_map_entry glusterd_volopt_map[] = {          {"cluster.entry-change-log",             "cluster/replicate",  NULL, NULL, NO_DOC, 0     },          {"cluster.read-subvolume",               "cluster/replicate",  NULL, NULL, NO_DOC, 0    }, +        {"cluster.read-hash-mode",               "cluster/replicate",  NULL, NULL, NO_DOC, 0},          {"cluster.background-self-heal-count",   "cluster/replicate",  NULL, NULL, NO_DOC, 0    },          {"cluster.metadata-self-heal",           "cluster/replicate",  NULL, NULL, NO_DOC, 0     },          {"cluster.data-self-heal",               "cluster/replicate",  NULL, NULL, NO_DOC, 0     },  | 
