-rwxr-xr-x | tests/bugs/bug-902610.t                         |  44
-rwxr-xr-x | tests/features/weighted-rebalance.t             |  91
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h            |   4
-rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c         |  27
-rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c          | 136
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c            |  13
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c |   4
7 files changed, 269 insertions, 50 deletions
diff --git a/tests/bugs/bug-902610.t b/tests/bugs/bug-902610.t
index 00ba03adfce..3f26fdde970 100755
--- a/tests/bugs/bug-902610.t
+++ b/tests/bugs/bug-902610.t
@@ -8,27 +8,33 @@ cleanup;
 function get_layout()
 {
         layout1=`getfattr -n trusted.glusterfs.dht -e hex $1 2>&1|grep dht |cut -d = -f2`
+        layout1_s=$(echo $layout1 | cut -c 19-26)
+        layout1_e=$(echo $layout1 | cut -c 27-34)
+        #echo "layout1 from $layout1_s to $layout1_e" > /dev/tty
         layout2=`getfattr -n trusted.glusterfs.dht -e hex $2 2>&1|grep dht |cut -d = -f2`
+        layout2_s=$(echo $layout2 | cut -c 19-26)
+        layout2_e=$(echo $layout2 | cut -c 27-34)
+        #echo "layout2 from $layout2_s to $layout2_e" > /dev/tty
+
+        if [ x"$layout2_s" = x"00000000" ]; then
+                # Reverse so we only have the real logic in one place.
+                tmp_s=$layout1_s
+                tmp_e=$layout1_e
+                layout1_s=$layout2_s
+                layout1_e=$layout2_e
+                layout2_s=$tmp_s
+                layout2_e=$tmp_e
+        fi
+
+        # Figure out where the join point is.
+        target=$(python -c "print '%08x' % (0x$layout1_e + 1)")
+        #echo "target for layout2 = $target" > /dev/tty
+
+        # The second layout should cover everything that the first doesn't.
+        if [ x"$layout2_s" = x"$target" -a x"$layout2_e" = x"ffffffff" ]; then
+                return 0
+        fi
 
-        if [ $layout1 == "0x0000000100000000000000007ffffffe" ]
-        then
-                if [ $layout2 == "0x00000001000000007fffffffffffffff" ]
-                then
-                        return 0
-                else
-                        return 1
-                fi
-        fi
-
-        if [ $layout2 == "0x0000000100000000000000007ffffffe" ]
-        then
-                if [ $layout1 == "0x00000001000000007fffffffffffffff" ]
-                then
-                        return 0
-                else
-                        return 1
-                fi
-        fi
         return 1
 }
 
diff --git a/tests/features/weighted-rebalance.t b/tests/features/weighted-rebalance.t
new file mode 100755
index 00000000000..a5e746970ae
--- /dev/null
+++ b/tests/features/weighted-rebalance.t
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+NFILES=1000
+
+touch_files () {
+        for i in $(seq 1 $NFILES); do
+                touch $(printf $M0/dir/file%02d $i) 2> /dev/null
+        done
+}
+
+count_files () {
+        found=0
+        for i in $(seq 1 $NFILES); do
+                if [ -f $(printf $1/dir/file%02d $i) ]; then
+                        found=$((found+1))
+                fi
+        done
+        echo $found
+}
+
+wait_for_rebalance () {
+        while true; do
+                rebalance_completed
+                if [ $? -eq 1 ]; then
+                        sleep 1
+                else
+                        break
+                fi
+        done
+}
+
+get_xattr () {
+        cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht"
+        $cmd $1 | od -tx1 -An | tr -d ' '
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST mkdir ${B0}/${V0}{1,2}
+
+TEST truncate --size $((40*1024*1024)) ${B0}/disk1
+TEST mkfs.xfs -f -i size=512 ${B0}/disk1
+TEST mount -o loop ${B0}/disk1 ${B0}/${V0}1
+
+TEST truncate --size $((80*1024*1024)) ${B0}/disk2
+TEST mkfs.xfs -f -i size=512 ${B0}/disk2
+TEST mount -o loop ${B0}/disk2 ${B0}/${V0}2
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+# Create some files for later tests.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+TEST mkdir $M0/dir
+TEST touch_files
+TEST umount $M0
+
+# Check that the larger brick got more of the files.
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -ge 580 ]
+
+# Turn off the size-weighted rebalance.
+TEST $CLI volume set $V0 cluster.weighted-rebalance off
+
+# Rebalance again and check that the distribution is even again.
+TEST $CLI volume rebalance $V0 start force
+TEST wait_for_rebalance
+nfiles=$(count_files ${B0}/${V0}2)
+echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty
+TEST [ $nfiles -le 580 ]
+
+exit 0
+
+$CLI volume stop $V0
+umount ${B0}/${V0}{1,2}
+rm -f ${B0}/disk{1,2}
+
+cleanup
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 5dee622a2a1..54f885d18b0 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -208,6 +208,7 @@ struct dht_du {
         double   avail_inodes;
         uint64_t avail_space;
         uint32_t log;
+        uint32_t chunks;
 };
 typedef struct dht_du dht_du_t;
 
@@ -315,6 +316,9 @@ struct dht_conf {
         char           *xattr_name;
         char           *link_xattr_name;
         char           *wild_xattr_name;
+
+        /* Support size-weighted rebalancing (heterogeneous bricks). */
+        gf_boolean_t    do_weighting;
         gf_boolean_t    randomize_by_gfid;
 };
 typedef struct dht_conf dht_conf_t;
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 8664f550ba2..a2dc43c32aa 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -37,6 +37,8 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         double   percent = 0;
         double   percent_inodes = 0;
         uint64_t bytes = 0;
+        uint32_t bpc;    /* blocks per chunk */
+        uint32_t chunks = 0;
 
         conf = this->private;
         prev = cookie;
@@ -50,17 +52,28 @@
         if (statvfs && statvfs->f_blocks) {
                 percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
                 bytes = (statvfs->f_bavail * statvfs->f_frsize);
+                /*
+                 * A 32-bit count of 1MB chunks allows a maximum brick size of
+                 * ~4PB. It's possible that we could see a single local FS
+                 * bigger than that some day, but this code is likely to be
+                 * irrelevant by then. Meanwhile, it's more important to keep
+                 * the chunk size small so the layout-calculation code that
+                 * uses this value can be tested on normal machines.
+                 */
+                bpc = (1 << 20) / statvfs->f_bsize;
+                chunks = (statvfs->f_blocks + bpc - 1) / bpc;
         }
 
         if (statvfs && statvfs->f_files) {
                 percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
         } else {
-                /* set percent inodes to 100 for dynamically allocated inode filesystems
-                   this logic holds good so that, distribute has nothing to worry about
-                   total inodes rather let the 'create()' to be scheduled on the hashed
-                   subvol regardless of the total inodes. since we have no awareness on
-                   loosing inodes this logic fits well
-                 */
+                /*
+                 * Set percent inodes to 100 for dynamically allocated inode
+                 * filesystems. The rationale is that distribute need not
+                 * worry about total inodes; rather, let the 'create()' be
+                 * scheduled on the hashed subvol regardless of the total
+                 * inodes.
+                 */
                 percent_inodes = 100;
         }
 
@@ -71,6 +84,7 @@
                         conf->du_stats[i].avail_percent = percent;
                         conf->du_stats[i].avail_space = bytes;
                         conf->du_stats[i].avail_inodes = percent_inodes;
+                        conf->du_stats[i].chunks = chunks;
                         gf_msg_debug (this->name, 0,
                                       "subvolume '%s': avail_percent "
                                       "is: %.2f and avail_space "
@@ -80,6 +94,7 @@
                                       conf->du_stats[i].avail_percent,
                                       conf->du_stats[i].avail_space,
                                       conf->du_stats[i].avail_inodes);
+                        break;  /* no point in looping further */
                 }
         }
         UNLOCK (&conf->subvolume_lock);
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index f476e44e0c1..a92dba89d2b 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -20,11 +20,11 @@
 #include "dht-messages.h"
 #include "glusterfs-acl.h"
 
-#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do {         \
+#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do {             \
                 layout->list[i].start = srt;                           \
                 layout->list[i].stop  = srt + chunk - 1;               \
                                                                        \
-                gf_msg_trace (this->name, 0,                            \
+                gf_msg_trace (this->name, 0,                           \
                               "gave fix: %u - %u on %s for %s",        \
                               layout->list[i].start,                   \
                               layout->list[i].stop,                    \
@@ -952,6 +952,18 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
                 new_layout->list[i].xlator = layout->list[i].xlator;
         }
 
+        if (priv->du_stats) {
+                for (i = 0; i < priv->subvolume_cnt; ++i) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "subvolume %d (%s): %u chunks", i,
+                                priv->subvolumes[i]->name,
+                                priv->du_stats[i].chunks);
+                }
+        }
+        else {
+                gf_log (this->name, GF_LOG_WARNING, "no du stats ?!?");
+        }
+
         /* First give it a layout as though it is a new directory. This
            ensures rotation to kick in */
         dht_layout_sort_volname (new_layout);
@@ -976,6 +988,32 @@ done:
 }
 
+/*
+ * Having to call this 2x for each entry in the layout is pretty horrible, but
+ * that's what all of this layout-sorting nonsense gets us.
+ */
+uint32_t
+dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child)
+{
+        dht_conf_t      *priv   = parent->private;
+        xlator_list_t   *trav;
+        uint32_t         index  = 0;
+
+        if (!priv->du_stats) {
+                return 0;
+        }
+
+        for (trav = parent->children; trav; trav = trav->next) {
+                if (trav->xlator == child) {
+                        return priv->du_stats[index].chunks;
+                }
+                ++index;
+        }
+
+        return 0;
+}
+
+
 
 void
 dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
                                    dht_layout_t *layout)
@@ -984,44 +1022,92 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
         uint32_t      chunk = 0;
         int           i = 0;
         uint32_t      start = 0;
-        int           cnt = 0;
+        int           bricks_to_use = 0;
         int           err = 0;
         int           start_subvol = 0;
+        uint32_t      curr_size;
+        uint32_t      total_size = 0;
+        int           real_i;
+        dht_conf_t   *priv;
+        gf_boolean_t  weight_by_size;
+        int           bricks_used = 0;
 
         this = frame->this;
+        priv = this->private;
+        weight_by_size = priv->do_weighting;
+
+        bricks_to_use = dht_get_layout_count (this, layout, 1);
+        GF_ASSERT (bricks_to_use > 0);
 
-        cnt = dht_get_layout_count (this, layout, 1);
+        bricks_used = 0;
+        for (i = 0; i < layout->cnt; ++i) {
+                err = layout->list[i].err;
+                if ((err != -1) && (err != ENOENT)) {
+                        continue;
+                }
+                curr_size = dht_get_chunks_from_xl (this,
+                                                    layout->list[i].xlator);
+                if (!curr_size) {
+                        weight_by_size = _gf_false;
+                        break;
+                }
+                total_size += curr_size;
+                if (++bricks_used >= bricks_to_use) {
+                        break;
+                }
+        }
 
-        chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
+        if (weight_by_size) {
+                /* We know total_size is not zero. */
+                chunk = ((unsigned long) 0xffffffff) / total_size;
+                gf_log (this->name, GF_LOG_INFO,
+                        "chunk size = 0xffffffff / %u = 0x%x",
+                        total_size, chunk);
+        }
+        else {
+                chunk = ((unsigned long) 0xffffffff) / bricks_used;
+        }
 
         start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
 
         /* clear out the range, as we are re-computing here */
         DHT_RESET_LAYOUT_RANGE (layout);
-        for (i = start_subvol; i < layout->cnt; i++) {
-                err = layout->list[i].err;
-                if (err == -1 || err == ENOENT) {
-                        DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
-                                             cnt, loc->path);
-                        if (--cnt == 0) {
-                                layout->list[i].stop = 0xffffffff;
-                                goto done;
-                        }
-                        start += chunk;
-                }
-        }
 
-        for (i = 0; i < start_subvol; i++) {
+        /*
+         * OK, what's this "real_i" stuff about? This used to be two loops -
+         * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1.
+         * That way is practically an open invitation to bugs when only one
+         * of the loops is updated. Using real_i and modulo operators to make
+         * it one loop avoids this problem. Remember, folks: it's everyone's
+         * responsibility to help stamp out copy/paste abuse.
+         */
+        bricks_used = 0;
+        for (real_i = 0; real_i < layout->cnt; real_i++) {
+                i = (real_i + start_subvol) % layout->cnt;
                 err = layout->list[i].err;
-                if (err == -1 || err == ENOENT) {
-                        DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
-                                             cnt, loc->path);
-                        if (--cnt == 0) {
-                                layout->list[i].stop = 0xffffffff;
-                                goto done;
+                if ((err != -1) && (err != ENOENT)) {
+                        continue;
+                }
+                if (weight_by_size) {
+                        curr_size = dht_get_chunks_from_xl (this,
+                                                layout->list[i].xlator);
+                        if (!curr_size) {
+                                continue;
                         }
-                        start += chunk;
                 }
+                else {
+                        curr_size = 1;
+                }
+                gf_log (this->name, GF_LOG_INFO,
+                        "assigning range size 0x%x to %s", chunk * curr_size,
+                        layout->list[i].xlator->name);
+                DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size,
+                                     loc->path);
+                if (++bricks_used >= bricks_to_use) {
+                        layout->list[i].stop = 0xffffffff;
+                        goto done;
+                }
+                start += (chunk * curr_size);
         }
 
 done:
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 4748d2a4f61..f8faecf6870 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -419,6 +419,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
         dht_init_regex (this, options, "extra-hash-regex",
                         &conf->extra_regex, &conf->extra_regex_valid);
 
+        GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options,
+                          bool, out);
+
         ret = 0;
 out:
         return ret;
@@ -658,6 +661,8 @@ dht_init (xlator_t *this)
                 goto err;
         }
 
+        GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err);
+
         this->private = conf;
 
         return 0;
@@ -790,6 +795,14 @@ struct volume_options options[] = {
           "below it."
         },
 
+        { .key = {"weighted-rebalance"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "on",
+          .description = "When enabled, files will be allocated to bricks "
+          "with a probability proportional to their size. Otherwise, all "
+          "bricks will have the same probability (legacy behavior)."
+        },
+
         /* NUFA option */
         { .key = {"local-volume-name"},
           .type = GF_OPTION_TYPE_XLATOR
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 92ab3d1a3a3..5358d52a43a 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -420,6 +420,10 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version = 3,
           .flags      = OPT_FLAG_CLIENT_OPT
         },
+        { .key        = "cluster.weighted-rebalance",
+          .voltype    = "cluster/distribute",
+          .op_version = GD_OP_VERSION_3_6_0,
+        },
 
         /* Switch xlator options (Distribute special case) */
         { .key        = "cluster.switch",
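For reviewers who want to see the arithmetic in isolation, the standalone C sketch below is not part of the commit; the brick sizes, the 4096-byte block size, the variable names, and the output format are made-up example values. It mirrors the 1MB chunk counting the patch adds to dht_du_info_cbk() and the proportional hash-range assignment it adds to dht_selfheal_layout_new_directory(); with a 40MB and an 80MB brick it assigns roughly one third and two thirds of the 32-bit hash space, which is the split the weighted-rebalance.t test expects.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only (not GlusterFS code): measure each brick in 1MB
 * "chunks" as dht_du_info_cbk() does, then split the 32-bit hash space
 * in proportion to those chunk counts as the new layout code does.
 */
int
main (void)
{
        /* Example bricks: 40MB and 80MB, with an assumed 4096-byte block size. */
        uint64_t brick_bytes[] = { 40ULL << 20, 80ULL << 20 };
        uint32_t brick_bsize   = 4096;
        int      nbricks       = 2;
        uint32_t chunks[2];
        uint32_t total_size    = 0;
        uint32_t chunk;          /* hash range awarded per 1MB chunk */
        uint32_t start          = 0;
        int      i;

        for (i = 0; i < nbricks; i++) {
                /* blocks per 1MB chunk, then round the block count up. */
                uint32_t bpc    = (1 << 20) / brick_bsize;
                uint64_t blocks = brick_bytes[i] / brick_bsize;
                chunks[i]       = (blocks + bpc - 1) / bpc;
                total_size     += chunks[i];
        }

        /* Same division the patch performs when weighting is enabled. */
        chunk = ((unsigned long) 0xffffffff) / total_size;

        for (i = 0; i < nbricks; i++) {
                /* Last brick's stop is forced to 0xffffffff, as in the patch. */
                uint32_t stop = (i == nbricks - 1)
                        ? 0xffffffff
                        : start + chunk * chunks[i] - 1;
                printf ("brick%d: %u chunks, range 0x%08x - 0x%08x\n",
                        i, (unsigned) chunks[i], (unsigned) start,
                        (unsigned) stop);
                start += chunk * chunks[i];
        }
        return 0;
}

Forcing the final brick's stop to 0xffffffff mirrors the patch's handling of the last used subvolume, so integer rounding in the per-chunk division never leaves part of the hash space unassigned.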