diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2014-06-17 13:42:45 +0000 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-07-12 09:20:52 -0700 |
commit | 99685f18f190a73f2a46478cac0b09f4c59834b1 (patch) | |
tree | f5e787ace3038b97876425c5397e91c0a37df04d /xlators | |
parent | d5ec66032ff96d7d417b5838a6bd1a047d52204c (diff) |
dht: support heterogeneous brick sizes
Calculation of layouts now considers the size of each brick, so that
smaller bricks don't get an "unfair" share of allocations and start
returning ENOSPC while the larger bricks still have plenty of space.
The observation has been made that some clients might get ENOTCONN when
trying to fetch disk-size information, and end up calculating layouts
differently. The following meta-observations can be made.
(1) This scenario is extremely unlikely in configurations with AFR.
(2) The most likely consequence of this scenario is that some files will
be placed sub-optimally by the client with the obsolete (non-weighted)
layout. They'll still be found anyway, so this isn't a show stopper.
(3) Without this patch it's *guaranteed* that some files will be placed
sub-optimally, because any layout that fails to account for brick sizes
is sub-optimal.
(4) We shouldn't be doing fix-layout from two nodes simultaneously
anyway. That's inefficient at best. Any instances of such behavior are
separate bugs, which should be fixed separately.
(5) In the most extreme edge case, two nodes doing weighted and
non-weighted layout fixes could race and end up creating an internally
inconsistent layout. This condition is still transient; it will be
detected and repaired automatically the next time anyone fetches the
layout. (If it's not, that's also a preexisting bug that can show up in
other contexts.)
In conclusion, it's not the purpose of this patch to fix bugs elsewhere
in DHT. Its purpose is to make life incrementally better for users who
add new hardware with larger disks etc. than the older equipment. It's
only one part of an ongoing process to improve layout management and
repair, all the way up to support for multiple hash rings or tiering.
Change-Id: I05eb6f9eface9cdaf8622e0260c8c7f29020447f
BUG: 1114680
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/8093
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 4 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c | 27 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 136 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 13 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 4 |
5 files changed, 153 insertions, 31 deletions
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 5dee622a2a1..54f885d18b0 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -208,6 +208,7 @@ struct dht_du { double avail_inodes; uint64_t avail_space; uint32_t log; + uint32_t chunks; }; typedef struct dht_du dht_du_t; @@ -315,6 +316,9 @@ struct dht_conf { char *xattr_name; char *link_xattr_name; char *wild_xattr_name; + + /* Support size-weighted rebalancing (heterogeneous bricks). */ + gf_boolean_t do_weighting; gf_boolean_t randomize_by_gfid; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 8664f550ba2..a2dc43c32aa 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -37,6 +37,8 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, double percent = 0; double percent_inodes = 0; uint64_t bytes = 0; + uint32_t bpc; /* blocks per chunk */ + uint32_t chunks = 0; conf = this->private; prev = cookie; @@ -50,17 +52,28 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (statvfs && statvfs->f_blocks) { percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; bytes = (statvfs->f_bavail * statvfs->f_frsize); + /* + * A 32-bit count of 1MB chunks allows a maximum brick size of + * ~4PB. It's possible that we could see a single local FS + * bigger than that some day, but this code is likely to be + * irrelevant by then. Meanwhile, it's more important to keep + * the chunk size small so the layout-calculation code that + * uses this value can be tested on normal machines. 
+ */ + bpc = (1 << 20) / statvfs->f_bsize; + chunks = (statvfs->f_blocks + bpc - 1) / bpc; } if (statvfs && statvfs->f_files) { percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; } else { - /* set percent inodes to 100 for dynamically allocated inode filesystems - this logic holds good so that, distribute has nothing to worry about - total inodes rather let the 'create()' to be scheduled on the hashed - subvol regardless of the total inodes. since we have no awareness on - loosing inodes this logic fits well - */ + /* + * Set percent inodes to 100 for dynamically allocated inode + * filesystems. The rationale is that distribute need not + * worry about total inodes; rather, let the 'create()' be + * scheduled on the hashed subvol regardless of the total + * inodes. + */ percent_inodes = 100; } @@ -71,6 +84,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, conf->du_stats[i].avail_percent = percent; conf->du_stats[i].avail_space = bytes; conf->du_stats[i].avail_inodes = percent_inodes; + conf->du_stats[i].chunks = chunks; gf_msg_debug (this->name, 0, "subvolume '%s': avail_percent " "is: %.2f and avail_space " @@ -80,6 +94,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, conf->du_stats[i].avail_percent, conf->du_stats[i].avail_space, conf->du_stats[i].avail_inodes); + break; /* no point in looping further */ } } UNLOCK (&conf->subvolume_lock); diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index f476e44e0c1..a92dba89d2b 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -20,11 +20,11 @@ #include "dht-messages.h" #include "glusterfs-acl.h" -#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ +#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \ layout->list[i].start = srt; \ layout->list[i].stop = srt + chunk - 1; \ \ - gf_msg_trace (this->name, 0, \ + gf_msg_trace (this->name, 0, \ "gave fix: %u 
- %u on %s for %s", \ layout->list[i].start, \ layout->list[i].stop, \ @@ -952,6 +952,18 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, new_layout->list[i].xlator = layout->list[i].xlator; } + if (priv->du_stats) { + for (i = 0; i < priv->subvolume_cnt; ++i) { + gf_log (this->name, GF_LOG_INFO, + "subvolume %d (%s): %u chunks", i, + priv->subvolumes[i]->name, + priv->du_stats[i].chunks); + } + } + else { + gf_log (this->name, GF_LOG_WARNING, "no du stats ?!?"); + } + /* First give it a layout as though it is a new directory. This ensures rotation to kick in */ dht_layout_sort_volname (new_layout); @@ -976,6 +988,32 @@ done: } +/* + * Having to call this 2x for each entry in the layout is pretty horrible, but + * that's what all of this layout-sorting nonsense gets us. + */ +uint32_t +dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child) +{ + dht_conf_t *priv = parent->private; + xlator_list_t *trav; + uint32_t index = 0; + + if (!priv->du_stats) { + return 0; + } + + for (trav = parent->children; trav; trav = trav->next) { + if (trav->xlator == child) { + return priv->du_stats[index].chunks; + } + ++index; + } + + return 0; +} + + void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) @@ -984,44 +1022,92 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, uint32_t chunk = 0; int i = 0; uint32_t start = 0; - int cnt = 0; + int bricks_to_use = 0; int err = 0; int start_subvol = 0; + uint32_t curr_size; + uint32_t total_size = 0; + int real_i; + dht_conf_t *priv; + gf_boolean_t weight_by_size; + int bricks_used = 0; this = frame->this; + priv = this->private; + weight_by_size = priv->do_weighting; + + bricks_to_use = dht_get_layout_count (this, layout, 1); + GF_ASSERT (bricks_to_use > 0); - cnt = dht_get_layout_count (this, layout, 1); + bricks_used = 0; + for (i = 0; i < layout->cnt; ++i) { + err = layout->list[i].err; + if ((err != -1) && (err != ENOENT)) { + continue; + } + curr_size 
= dht_get_chunks_from_xl (this, + layout->list[i].xlator); + if (!curr_size) { + weight_by_size = _gf_false; + break; + } + total_size += curr_size; + if (++bricks_used >= bricks_to_use) { + break; + } + } - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); + if (weight_by_size) { + /* We know total_size is not zero. */ + chunk = ((unsigned long) 0xffffffff) / total_size; + gf_log (this->name, GF_LOG_INFO, + "chunk size = 0xffffffff / %u = 0x%x", + total_size, chunk); + } + else { + chunk = ((unsigned long) 0xffffffff) / bricks_used; + } start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); /* clear out the range, as we are re-computing here */ DHT_RESET_LAYOUT_RANGE (layout); - for (i = start_subvol; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1 || err == ENOENT) { - DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, - cnt, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - goto done; - } - start += chunk; - } - } - for (i = 0; i < start_subvol; i++) { + /* + * OK, what's this "real_i" stuff about? This used to be two loops - + * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1. + * That way is practically an open invitation to bugs when only one + * of the loops is updated. Using real_i and modulo operators to make + * it one loop avoids this problem. Remember, folks: it's everyone's + * responsibility to help stamp out copy/paste abuse. 
+ */ + bricks_used = 0; + for (real_i = 0; real_i < layout->cnt; real_i++) { + i = (real_i + start_subvol) % layout->cnt; err = layout->list[i].err; - if (err == -1 || err == ENOENT) { - DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, - cnt, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - goto done; + if ((err != -1) && (err != ENOENT)) { + continue; + } + if (weight_by_size) { + curr_size = dht_get_chunks_from_xl (this, + layout->list[i].xlator); + if (!curr_size) { + continue; } - start += chunk; } + else { + curr_size = 1; + } + gf_log (this->name, GF_LOG_INFO, + "assigning range size 0x%x to %s", chunk * curr_size, + layout->list[i].xlator->name); + DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size, + loc->path); + if (++bricks_used >= bricks_to_use) { + layout->list[i].stop = 0xffffffff; + goto done; + } + start += (chunk * curr_size); } done: diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 4748d2a4f61..f8faecf6870 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -419,6 +419,9 @@ dht_reconfigure (xlator_t *this, dict_t *options) dht_init_regex (this, options, "extra-hash-regex", &conf->extra_regex, &conf->extra_regex_valid); + GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options, + bool, out); + ret = 0; out: return ret; @@ -658,6 +661,8 @@ dht_init (xlator_t *this) goto err; } + GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err); + this->private = conf; return 0; @@ -790,6 +795,14 @@ struct volume_options options[] = { "below it." }, + { .key = {"weighted-rebalance"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "When enabled, files will be allocated to bricks " + "with a probability proportional to their size. Otherwise, all " + "bricks will have the same probability (legacy behavior)." 
+ }, + /* NUFA option */ { .key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 92ab3d1a3a3..5358d52a43a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -420,6 +420,10 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 3, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.weighted-rebalance", + .voltype = "cluster/distribute", + .op_version = GD_OP_VERSION_3_6_0, + }, /* Switch xlator options (Distribute special case) */ { .key = "cluster.switch", |