From 08d63afa1b26e4f42ab1b85a14e6bfc3836de28e Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Thu, 14 Jun 2012 16:25:09 -0400 Subject: Add scripts for size-weighted rebalance. Change-Id: I04197e54fef2ff7b61fbee21ab837219354184f1 Signed-off-by: Jeff Darcy Reviewed-on: http://review.gluster.org/3573 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- extras/rebalance.py | 299 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100755 extras/rebalance.py (limited to 'extras/rebalance.py') diff --git a/extras/rebalance.py b/extras/rebalance.py new file mode 100755 index 00000000..80c614c5 --- /dev/null +++ b/extras/rebalance.py @@ -0,0 +1,299 @@ +#!/usr/bin/python + +import atexit +import copy +import optparse +import os +import pipes +import shutil +import string +import subprocess +import sys +import tempfile +import volfilter + +# It's just more convenient to have named fields. +class Brick: + def __init__ (self, path, name): + self.path = path + self.sv_name = name + self.size = 0 + self.curr_size = 0 + self.good_size = 0 + def set_size (self, size): + self.size = size + def set_range (self, rs, re): + self.r_start = rs + self.r_end = re + self.curr_size = self.r_end - self.r_start + 1 + def __repr__ (self): + value = self.path[:] + value += "(%d," % self.size + if self.curr_size: + value += "0x%x,0x%x)" % (self.r_start, self.r_end) + else: + value += "-)" + return value + +def get_bricks (host, vol): + t = pipes.Template() + t.prepend("gluster --remote-host=%s system getspec %s"%(host,vol),".-") + return t.open(None,"r") + +def generate_stanza (vf, all_xlators, cur_subvol): + sv_list = [] + for sv in cur_subvol.subvols: + generate_stanza(vf,all_xlators,sv) + sv_list.append(sv.name) + vf.write("volume %s\n"%cur_subvol.name) + vf.write(" type %s\n"%cur_subvol.type) + for kvpair in cur_subvol.opts.iteritems(): + vf.write(" option %s %s\n"%kvpair) + if sv_list: + vf.write(" subvolumes %s\n"%string.join(sv_list)) + vf.write("end-volume\n\n") + + +def mount_brick (localpath, all_xlators, dht_subvol): + + # Generate a volfile. + vf_name = localpath + ".vol" + vf = open(vf_name,"w") + generate_stanza(vf,all_xlators,dht_subvol) + vf.flush() + vf.close() + + # Create a brick directory and mount the brick there. + os.mkdir(localpath) + subprocess.call(["glusterfs","-f",vf_name,localpath]) + +# We use the command-line tools because there's no getxattr support in the +# Python standard library (which is ridiculous IMO). Adding the xattr package +# from PyPI would create a new and difficult dependency because the bits to +# satisfy it don't seem to exist in Fedora. We already expect the command-line +# tools to be there, so it's safer just to rely on them. +# +# We might have to revisit this if we get as far as actually issuing millions +# of setxattr requests. Even then, it might be better to do that part with a C +# program which has only a build-time dependency. +def get_range (brick): + t = pipes.Template() + cmd = "getfattr -e hex -n trusted.glusterfs.dht %s 2> /dev/null" + t.prepend(cmd%brick,".-") + t.append("grep ^trusted.glusterfs.dht=","--") + f = t.open(None,"r") + try: + value = f.readline().rstrip().split('=')[1][2:] + except: + print "could not get layout for %s (might be OK)" % brick + return None + v_start = int("0x"+value[16:24],16) + v_end = int("0x"+value[24:32],16) + return (v_start, v_end) + +def calc_sizes (bricks, total): + leftover = 1 << 32 + for b in bricks: + if b.size: + b.good_size = (b.size << 32) / total + leftover -= b.good_size + else: + b.good_size = 0 + if leftover: + # Add the leftover to an old brick if we can. + for b in bricks: + if b.good_size: + b.good_size += leftover + break + else: + # Fine, just add it wherever. + bricks[0].good_size += leftover + +# Normalization means sorting the bricks by r_start and (b) ensuring that there +# are no gaps. +def normalize (in_bricks): + out_bricks = [] + curr_hash = 0 + used = 0 + while curr_hash < (1<<32): + curr_best = None + for b in in_bricks: + if b.r_start == curr_hash: + used += 1 + out_bricks.append(b) + in_bricks.remove(b) + curr_hash = b.r_end + 1 + break + else: + print "gap found at 0x%08x" % curr_hash + sys.exit(1) + return out_bricks + in_bricks, used + +def get_score (bricks): + score = 0 + curr_hash = 0 + for b in bricks: + if not b.curr_size: + curr_hash += b.good_size + continue + new_start = curr_hash + curr_hash += b.good_size + new_end = curr_hash - 1 + if new_start > b.r_start: + max_start = new_start + else: + max_start = b.r_start + if new_end < b.r_end: + min_end = new_end + else: + min_end = b.r_end + if max_start <= min_end: + score += (min_end - max_start + 1) + return score + +if __name__ == "__main__": + + my_usage = "%prog [options] server volume [directory]" + parser = optparse.OptionParser(usage=my_usage) + parser.add_option("-f", "--free-space", dest="free_space", + default=False, action="store_true", + help="use free space instead of total space") + parser.add_option("-l", "--leave-mounted", dest="leave_mounted", + default=False, action="store_true", + help="leave subvolumes mounted") + parser.add_option("-v", "--verbose", dest="verbose", + default=False, action="store_true", + help="verbose output") + options, args = parser.parse_args() + + if len(args) == 3: + fix_dir = args[2] + else: + if len(args) != 2: + parser.print_help() + sys.exit(1) + fix_dir = None + hostname, volname = args[:2] + + # Make sure stuff gets cleaned up, even if there are exceptions. + orig_dir = os.getcwd() + work_dir = tempfile.mkdtemp() + bricks = [] + def cleanup_workdir (): + os.chdir(orig_dir) + if options.verbose: + print "Cleaning up %s" % work_dir + for b in bricks: + subprocess.call(["umount",b.path]) + shutil.rmtree(work_dir) + if not options.leave_mounted: + atexit.register(cleanup_workdir) + os.chdir(work_dir) + + # Mount each brick individually, so we can issue brick-specific calls. + if options.verbose: + print "Mounting subvolumes..." + index = 0 + volfile_pipe = get_bricks(hostname,volname) + all_xlators, last_xlator = volfilter.load(volfile_pipe) + for dht_vol in all_xlators.itervalues(): + if dht_vol.type == "cluster/distribute": + break + else: + print "no DHT volume found" + sys.exit(1) + for sv in dht_vol.subvols: + #print "found subvol %s" % sv.name + lpath = "%s/brick%s" % (work_dir, index) + index += 1 + mount_brick(lpath,all_xlators,sv) + bricks.append(Brick(lpath,sv.name)) + if index == 0: + print "no bricks" + sys.exit(1) + + # Collect all of the sizes. + if options.verbose: + print "Collecting information..." + total = 0 + for b in bricks: + info = os.statvfs(b.path) + # We want a standard unit even if different bricks use + # different block sizes. The size is chosen to avoid overflows + # for very large bricks with very small block sizes, but also + # accommodate filesystems which use very large block sizes to + # cheat on benchmarks. + blocksper100mb = 104857600 / info[0] + if options.free_space: + size = info[3] / blocksper100mb + else: + size = info[2] / blocksper100mb + if size <= 0: + print "brick %s has invalid size %d" % (b.path, size) + sys.exit(1) + b.set_size(size) + total += size + + # Collect all of the layout information. + for b in bricks: + hash_range = get_range(b.path) + if hash_range is not None: + rs, re = hash_range + if rs > re: + print "%s has backwards hash range" % b.path + sys.exit(1) + b.set_range(hash_range[0],hash_range[1]) + + if options.verbose: + print "Calculating new layouts..." + calc_sizes(bricks,total) + bricks, used = normalize(bricks) + + # We can't afford O(n!) here, but O(n^2) should be OK and the result + # should be almost as good. + while used < len(bricks): + best_place = used + best_score = get_score(bricks) + for i in xrange(used): + new_bricks = bricks[:] + del new_bricks[used] + new_bricks.insert(i,bricks[used]) + new_score = get_score(new_bricks) + if new_score > best_score: + best_place = i + best_score = new_score + if best_place != used: + nb = bricks[used] + del bricks[used] + bricks.insert(best_place,nb) + used += 1 + + # Finalize whatever we decided on. + curr_hash = 0 + for b in bricks: + b.r_start = curr_hash + curr_hash += b.good_size + b.r_end = curr_hash - 1 + + print "Here are the xattr values for your size-weighted layout:" + for b in bricks: + print " %s: 0x0000000200000000%08x%08x" % ( + b.sv_name, b.r_start, b.r_end) + + if fix_dir: + if options.verbose: + print "Fixing layout for %s" % fix_dir + for b in bricks: + value = "0x0000000200000000%08x%08x" % ( + b.r_start, b.r_end) + path = "%s/%s" % (b.path, fix_dir) + cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % ( + value, path) + print cmd + + if options.leave_mounted: + print "The following subvolumes are still mounted:" + for b in bricks: + print "%s on %s" % (b.sv_name, b.path) + print "Don't forget to clean up when you're done." + -- cgit