diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 101 | 
1 files changed, 80 insertions, 21 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 96f13ce2cee..6863bd02c50 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1597,19 +1597,18 @@ out:  }  int -afr_least_pending_reads_child(afr_private_t *priv) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)  {      int i = 0; -    int child = 0; +    int child = -1;      int64_t read_iter = -1;      int64_t pending_read = -1; -    pending_read = GF_ATOMIC_GET(priv->pending_reads[0]); -    for (i = 1; i < priv->child_count; i++) { -        if (AFR_IS_ARBITER_BRICK(priv, i)) +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])              continue;          read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); -        if (read_iter < pending_read) { +        if (child == -1 || read_iter < pending_read) {              pending_read = read_iter;              child = i;          } @@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv)      return child;  } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ +    int32_t i = 0; +    int child = -1; + +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || +            priv->child_latency[i] < 0) +            continue; + +        if (child == -1 || +            priv->child_latency[i] < priv->child_latency[child]) { +            child = i; +        } +    } +    return child; +} + +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, +                                            unsigned char *readable) +{ +    int32_t i = 0; +    int child = -1; +    int64_t pending_read = 0; +    int64_t latency = -1; +    int64_t least_latency = -1; + +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || +            priv->child_latency[i] < 0) +            continue; + +        pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); +        latency = (pending_read + 1) * priv->child_latency[i]; + +        if (child == -1 || latency < least_latency) { +            least_latency = latency; +            child = i; +        } +    } +    return child; +} +  int -afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, +               unsigned char *readable)  {      uuid_t gfid_copy = {          0, @@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)      int child = -1;      switch (priv->hash_mode) { -        case 0: +        case AFR_READ_POLICY_FIRST_UP:              break; -        case 1: +        case AFR_READ_POLICY_GFID_HASH:              gf_uuid_copy(gfid_copy, args->gfid);              child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %                      priv->child_count;              break; -        case 2: +        case AFR_READ_POLICY_GFID_PID_HASH:              if (args->ia_type != IA_IFDIR) {                  /*                   * Why getpid?  Because it's one of the cheapest calls @@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)              child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %                      priv->child_count;              break; -        case 3: -            child = afr_least_pending_reads_child(priv); +        case AFR_READ_POLICY_LESS_LOAD: +            child = afr_least_pending_reads_child(priv, readable); +            break; +        case AFR_READ_POLICY_LEAST_LATENCY: +            child = afr_least_latency_child(priv, readable); +            break; +        case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: +            child = afr_least_latency_times_pending_reads_child(priv, readable);              break;      } @@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,      }      /* second preference - use hashed mode */ -    read_subvol = afr_hash_child(&local_args, priv); +    read_subvol = afr_hash_child(&local_args, priv, readable);      if (read_subvol >= 0 && readable[read_subvol])          return read_subvol; @@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,       * want to set the child_latency to MAX to indicate       * the child needs ping data to be available before doing child-up       */ -    if (child_latency_msec < 0 && priv->halo_enabled) { +    if (!priv->halo_enabled) +        goto out; + +    if (child_latency_msec < 0) {          /*set to INT64_MAX-1 so that it is found for best_down_child*/          priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;      } @@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,                       "up_children (%d) > halo_max_replicas (%d)",                       worst_up_child, up_children, priv->halo_max_replicas);      } - +out:      if (up_children == 1) {          gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,                 "Subvolume '%s' came back up; " @@ -5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,       * as we want it to be up to date if we are going to       * begin using it synchronously.       */ -    if (up_children < priv->halo_min_replicas) { +    if (priv->halo_enabled && up_children < priv->halo_min_replicas) {          best_down_child = find_best_down_child(this);          if (best_down_child >= 0) {              gf_msg_debug(this->name, 0, @@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,              *up_child = best_down_child;          }      } -      for (i = 0; i < priv->child_count; i++)          if (priv->child_up[i] == 0)              down_children++; @@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)      had_quorum = priv->quorum_count &&                   afr_has_quorum(priv->child_up, this, NULL); -    if (priv->halo_enabled) { -        halo_max_latency_msec = afr_get_halo_latency(this); +    if (event == GF_EVENT_CHILD_PING) { +        child_latency_msec = (int64_t)(uintptr_t)data2; +        if (priv->halo_enabled) { +            halo_max_latency_msec = afr_get_halo_latency(this); -        if (event == GF_EVENT_CHILD_PING) {              /* Calculates the child latency and sets event               */ -            child_latency_msec = (int64_t)(uintptr_t)data2;              LOCK(&priv->lock);              {                  __afr_handle_ping_event(this, child_xlator, idx, @@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                                          child_latency_msec);              }              UNLOCK(&priv->lock); +        } else { +            LOCK(&priv->lock); +            { +                priv->child_latency[idx] = child_latency_msec; +            } +            UNLOCK(&priv->lock);          }      }  | 
