diff options (diffstat)
 tests/bugs/bug-902610.t                         (mode 100755) |  44
 tests/features/weighted-rebalance.t             (mode 100755) |  91
 xlators/cluster/dht/src/dht-common.h            (mode 100644) |   4
 xlators/cluster/dht/src/dht-diskusage.c         (mode 100644) |  27
 xlators/cluster/dht/src/dht-selfheal.c          (mode 100644) | 136
 xlators/cluster/dht/src/dht-shared.c            (mode 100644) |  13
 xlators/mgmt/glusterd/src/glusterd-volume-set.c (mode 100644) |   4
 7 files changed, 269 insertions(+), 50 deletions(-)
diff --git a/tests/bugs/bug-902610.t b/tests/bugs/bug-902610.t index 00ba03adfce..3f26fdde970 100755 --- a/tests/bugs/bug-902610.t +++ b/tests/bugs/bug-902610.t @@ -8,27 +8,33 @@ cleanup;  function get_layout()  {          layout1=`getfattr -n trusted.glusterfs.dht -e hex $1 2>&1|grep dht |cut -d = -f2` +	layout1_s=$(echo $layout1 | cut -c 19-26) +	layout1_e=$(echo $layout1 | cut -c 27-34) +	#echo "layout1 from $layout1_s to $layout1_e" > /dev/tty          layout2=`getfattr -n trusted.glusterfs.dht -e hex $2 2>&1|grep dht |cut -d = -f2` +	layout2_s=$(echo $layout2 | cut -c 19-26) +	layout2_e=$(echo $layout2 | cut -c 27-34) +	#echo "layout2 from $layout2_s to $layout2_e" > /dev/tty + +	if [ x"$layout2_s" = x"00000000" ]; then +		# Reverse so we only have the real logic in one place. +		tmp_s=$layout1_s +		tmp_e=$layout1_e +		layout1_s=$layout2_s +		layout1_e=$layout2_e +		layout2_s=$tmp_s +		layout2_e=$tmp_e +	fi + +	# Figure out where the join point is. +	target=$(python -c "print '%08x' % (0x$layout1_e + 1)") +	#echo "target for layout2 = $target" > /dev/tty + +	# The second layout should cover everything that the first doesn't. +	if [ x"$layout2_s" = x"$target" -a x"$layout2_e" = x"ffffffff" ]; then +		return 0 +	fi -        if [ $layout1 == "0x0000000100000000000000007ffffffe" ] -        then -                if [ $layout2 == "0x00000001000000007fffffffffffffff" ] -		then -			return 0 -		else -			return 1 -		fi -        fi - -	if [ $layout2 == "0x0000000100000000000000007ffffffe" ] -        then -                if [ $layout1 == "0x00000001000000007fffffffffffffff" ] -		then -			return 0 -		else -			return 1 -		fi -        fi  	return 1  } diff --git a/tests/features/weighted-rebalance.t b/tests/features/weighted-rebalance.t new file mode 100755 index 00000000000..a5e746970ae --- /dev/null +++ b/tests/features/weighted-rebalance.t @@ -0,0 +1,91 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../dht.rc + +NFILES=1000 + +touch_files () { +	for i in $(seq 1 $NFILES); do +		touch $(printf $M0/dir/file%02d $i) 2> /dev/null +	done +} + +count_files () { +	found=0 +	for i in $(seq 1 $NFILES); do +		if [ -f $(printf $1/dir/file%02d $i) ]; then +			found=$((found+1)) +		fi +	done +	echo $found +} + +wait_for_rebalance () { +	while true; do +		rebalance_completed +		if [ $? -eq 1 ]; then +			sleep 1 +		else +			break +		fi +	done +} + +get_xattr () { +	cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht" +	$cmd $1 | od -tx1 -An | tr -d ' ' +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST mkdir ${B0}/${V0}{1,2} + +TEST truncate --size $((40*1024*1024)) ${B0}/disk1 +TEST mkfs.xfs -f -i size=512 ${B0}/disk1 +TEST mount -o loop ${B0}/disk1 ${B0}/${V0}1 + +TEST truncate --size $((80*1024*1024)) ${B0}/disk2 +TEST mkfs.xfs -f -i size=512 ${B0}/disk2 +TEST mount -o loop ${B0}/disk2 ${B0}/${V0}2 + +TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2} +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +# Create some files for later tests. +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 +TEST mkdir $M0/dir +TEST touch_files +TEST umount $M0 + +# Check that the larger brick got more of the files. +nfiles=$(count_files ${B0}/${V0}2) +echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty +TEST [ $nfiles -ge 580 ] + +# Turn off the size-weighted rebalance. +TEST $CLI volume set $V0 cluster.weighted-rebalance off + +# Rebalance again and check that the distribution is even again. 
+TEST $CLI volume rebalance $V0 start force +TEST wait_for_rebalance +nfiles=$(count_files ${B0}/${V0}2) +echo $nfiles $(get_xattr ${B0}/${V0}1) $(get_xattr ${B0}/${V0}2) > /dev/tty +TEST [ $nfiles -le 580 ] + +exit 0 + +$CLI volume stop $V0 +umount ${B0}/${V0}{1,2} +rm -f ${B0}/disk{1,2} + +cleanup diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 5dee622a2a1..54f885d18b0 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -208,6 +208,7 @@ struct dht_du {  	double   avail_inodes;          uint64_t avail_space;          uint32_t log; +        uint32_t chunks;  };  typedef struct dht_du dht_du_t; @@ -315,6 +316,9 @@ struct dht_conf {          char            *xattr_name;          char            *link_xattr_name;          char            *wild_xattr_name; + +        /* Support size-weighted rebalancing (heterogeneous bricks). */ +        gf_boolean_t    do_weighting;          gf_boolean_t    randomize_by_gfid;  };  typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 8664f550ba2..a2dc43c32aa 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -37,6 +37,8 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	double         percent = 0;  	double         percent_inodes = 0;  	uint64_t       bytes = 0; +        uint32_t       bpc;     /* blocks per chunk */ +        uint32_t       chunks   = 0;  	conf = this->private;  	prev = cookie; @@ -50,17 +52,28 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	if (statvfs && statvfs->f_blocks) {  		percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;  		bytes = (statvfs->f_bavail * statvfs->f_frsize); +                /* +                 * A 32-bit count of 1MB chunks allows a maximum brick size of +                 * ~4PB.  
It's possible that we could see a single local FS +                 * bigger than that some day, but this code is likely to be +                 * irrelevant by then.  Meanwhile, it's more important to keep +                 * the chunk size small so the layout-calculation code that +                 * uses this value can be tested on normal machines. +                 */ +                bpc = (1 << 20) / statvfs->f_bsize; +                chunks = (statvfs->f_blocks + bpc - 1) / bpc;  	}  	if (statvfs && statvfs->f_files) {  		percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;  	} else { -		/* set percent inodes to 100 for dynamically allocated inode filesystems -		   this logic holds good so that, distribute has nothing to worry about -		   total inodes rather let the 'create()' to be scheduled on the hashed -		   subvol regardless of the total inodes. since we have no awareness on -		   loosing inodes this logic fits well -		*/ +                /* +                 * Set percent inodes to 100 for dynamically allocated inode +                 * filesystems. The rationale is that distribute need not +                 * worry about total inodes; rather, let the 'create()' be +                 * scheduled on the hashed subvol regardless of the total +                 * inodes. 
+		 */  		percent_inodes = 100;  	} @@ -71,6 +84,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  				conf->du_stats[i].avail_percent = percent;  				conf->du_stats[i].avail_space   = bytes;  				conf->du_stats[i].avail_inodes  = percent_inodes; +                                conf->du_stats[i].chunks        = chunks;  				gf_msg_debug (this->name, 0,  				              "subvolume '%s': avail_percent "  					      "is: %.2f and avail_space " @@ -80,6 +94,7 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  					      conf->du_stats[i].avail_percent,  					      conf->du_stats[i].avail_space,  					      conf->du_stats[i].avail_inodes); +                                break;  /* no point in looping further */  			}  	}  	UNLOCK (&conf->subvolume_lock); diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index f476e44e0c1..a92dba89d2b 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -20,11 +20,11 @@  #include "dht-messages.h"  #include "glusterfs-acl.h" -#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path)    do {       \ +#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path)    do {           \                  layout->list[i].start = srt;                            \                  layout->list[i].stop  = srt + chunk - 1;                \                                                                          \ -                gf_msg_trace (this->name, 0,                       \ +                gf_msg_trace (this->name, 0,                            \                                "gave fix: %u - %u on %s for %s",         \                                layout->list[i].start,                    \                                layout->list[i].stop,                     \ @@ -952,6 +952,18 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,  		new_layout->list[i].xlator = layout->list[i].xlator;          
} +        if (priv->du_stats) { +                for (i = 0; i < priv->subvolume_cnt; ++i) { +                        gf_log (this->name, GF_LOG_INFO, +                                "subvolume %d (%s): %u chunks", i, +                                priv->subvolumes[i]->name, +                                priv->du_stats[i].chunks); +                } +        } +        else { +                gf_log (this->name, GF_LOG_WARNING, "no du stats ?!?"); +        } +  	/* First give it a layout as though it is a new directory. This  	   ensures rotation to kick in */          dht_layout_sort_volname (new_layout); @@ -976,6 +988,32 @@ done:  } +/* + * Having to call this 2x for each entry in the layout is pretty horrible, but + * that's what all of this layout-sorting nonsense gets us. + */ +uint32_t +dht_get_chunks_from_xl (xlator_t *parent, xlator_t *child) +{ +        dht_conf_t      *priv   = parent->private; +        xlator_list_t   *trav; +        uint32_t        index   = 0; + +        if (!priv->du_stats) { +                return 0; +        } + +        for (trav = parent->children; trav; trav = trav->next) { +                if (trav->xlator == child) { +                        return priv->du_stats[index].chunks; +                } +                ++index; +        } + +        return 0; +} + +  void  dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,                                     dht_layout_t *layout) @@ -984,44 +1022,92 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,          uint32_t     chunk = 0;          int          i = 0;          uint32_t     start = 0; -        int          cnt = 0; +        int          bricks_to_use = 0;          int          err = 0;          int          start_subvol = 0; +        uint32_t     curr_size; +        uint32_t     total_size = 0; +        int          real_i; +        dht_conf_t   *priv; +        gf_boolean_t weight_by_size; +        int          bricks_used = 0;    
      this = frame->this; +        priv = this->private; +        weight_by_size = priv->do_weighting; + +        bricks_to_use = dht_get_layout_count (this, layout, 1); +        GF_ASSERT (bricks_to_use > 0); -        cnt = dht_get_layout_count (this, layout, 1); +        bricks_used = 0; +        for (i = 0; i < layout->cnt; ++i) { +                err = layout->list[i].err; +                if ((err != -1) && (err != ENOENT)) { +                        continue; +                } +                curr_size = dht_get_chunks_from_xl (this, +                                                    layout->list[i].xlator); +                if (!curr_size) { +                        weight_by_size = _gf_false; +                        break; +                } +                total_size += curr_size; +                if (++bricks_used >= bricks_to_use) { +                        break; +                } +        } -        chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); +        if (weight_by_size) { +                /* We know total_size is not zero. 
*/ +                chunk = ((unsigned long) 0xffffffff) / total_size; +                gf_log (this->name, GF_LOG_INFO, +                        "chunk size = 0xffffffff / %u = 0x%x", +                        total_size, chunk); +        } +        else { +                chunk = ((unsigned long) 0xffffffff) / bricks_used; +        }          start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);          /* clear out the range, as we are re-computing here */          DHT_RESET_LAYOUT_RANGE (layout); -        for (i = start_subvol; i < layout->cnt; i++) { -                err = layout->list[i].err; -                if (err == -1 || err == ENOENT) { -                        DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, -                                             cnt, loc->path); -                        if (--cnt == 0) { -                                layout->list[i].stop = 0xffffffff; -                                goto done; -                        } -                        start += chunk; -                } -        } -        for (i = 0; i < start_subvol; i++) { +        /* +         * OK, what's this "real_i" stuff about?  This used to be two loops - +         * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1. +         * That way is practically an open invitation to bugs when only one +         * of the loops is updated.  Using real_i and modulo operators to make +         * it one loop avoids this problem.  Remember, folks: it's everyone's +         * responsibility to help stamp out copy/paste abuse. 
+         */ +        bricks_used = 0; +        for (real_i = 0; real_i < layout->cnt; real_i++) { +                i = (real_i + start_subvol) % layout->cnt;                  err = layout->list[i].err; -                if (err == -1 || err == ENOENT) { -                        DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, -                                             cnt, loc->path); -                        if (--cnt == 0) { -                                layout->list[i].stop = 0xffffffff; -                                goto done; +                if ((err != -1) && (err != ENOENT)) { +                        continue; +                } +                if (weight_by_size) { +                        curr_size = dht_get_chunks_from_xl (this, +                                layout->list[i].xlator); +                        if (!curr_size) { +                                continue;                          } -                        start += chunk;                  } +                else { +                        curr_size = 1; +                } +                gf_log (this->name, GF_LOG_INFO, +                        "assigning range size 0x%x to %s", chunk * curr_size, +                        layout->list[i].xlator->name); +                DHT_SET_LAYOUT_RANGE(layout, i, start, chunk * curr_size, +                                     loc->path); +                if (++bricks_used >= bricks_to_use) { +                        layout->list[i].stop = 0xffffffff; +                        goto done; +                } +                start += (chunk * curr_size);          }  done: diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 4748d2a4f61..f8faecf6870 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -419,6 +419,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)          dht_init_regex (this, options, "extra-hash-regex",                          
&conf->extra_regex, &conf->extra_regex_valid); +        GF_OPTION_RECONF ("weighted-rebalance", conf->do_weighting, options, +                          bool, out); +          ret = 0;  out:          return ret; @@ -658,6 +661,8 @@ dht_init (xlator_t *this)                  goto err;          } +        GF_OPTION_INIT ("weighted-rebalance", conf->do_weighting, bool, err); +          this->private = conf;          return 0; @@ -790,6 +795,14 @@ struct volume_options options[] = {            "below it."          }, +        { .key = {"weighted-rebalance"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "When enabled, files will be allocated to bricks " +          "with a probability proportional to their size.  Otherwise, all " +          "bricks will have the same probability (legacy behavior)." +        }, +          /* NUFA option */          { .key  = {"local-volume-name"},            .type = GF_OPTION_TYPE_XLATOR diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 92ab3d1a3a3..5358d52a43a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -420,6 +420,10 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = 3,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "cluster.weighted-rebalance", +          .voltype    = "cluster/distribute", +          .op_version = GD_OP_VERSION_3_6_0, +        },          /* Switch xlator options (Distribute special case) */          { .key        = "cluster.switch",  | 
