author     Jeff Darcy <jdarcy@redhat.com>    2012-06-14 16:25:09 -0400
committer  Anand Avati <avati@redhat.com>    2013-02-07 14:27:47 -0800
commit     08d63afa1b26e4f42ab1b85a14e6bfc3836de28e (patch)
tree       e259fbfdc5b915ca61cd7f7edc4513fa0fb1d54a /extras/rebalance.py
parent     6a6ec980421a115a379aed97364817c16ce7b378 (diff)
Add scripts for size-weighted rebalance.
Change-Id: I04197e54fef2ff7b61fbee21ab837219354184f1
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/3573
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'extras/rebalance.py')
-rwxr-xr-x  extras/rebalance.py  299
1 files changed, 299 insertions, 0 deletions
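
Note: the new script reads and reports values of the trusted.glusterfs.dht extended attribute, which it treats as a hex string of four 32-bit words: a fixed header of 0x00000002 and 0x00000000 (the full meaning of those header words is defined by DHT's on-disk layout format and not examined here), followed by the hash range start and end. A minimal sketch of that encoding and decoding, with invented range values:

    # Hedged illustration of the trusted.glusterfs.dht value format assumed by
    # get_range() and by the output loop in rebalance.py; the range values here
    # are invented.
    HEADER = "0x0000000200000000"        # fixed prefix the script expects and emits

    def encode_layout(r_start, r_end):
        # Pack a hash range into the hex string the script passes to setfattr.
        return "%s%08x%08x" % (HEADER, r_start, r_end)

    def decode_layout(value):
        # Mirror get_range(): drop "0x", skip the 16-digit header, then read the
        # two remaining 32-bit words as range start and range end.
        v = value[2:]
        return int(v[16:24], 16), int(v[24:32], 16)

    example = encode_layout(0x00000000, 0x3fffffff)
    assert decode_layout(example) == (0x00000000, 0x3fffffff)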
diff --git a/extras/rebalance.py b/extras/rebalance.py
new file mode 100755
index 00000000000..80c614c5dfe
--- /dev/null
+++ b/extras/rebalance.py
@@ -0,0 +1,299 @@
+#!/usr/bin/python
+
+import atexit
+import copy
+import optparse
+import os
+import pipes
+import shutil
+import string
+import subprocess
+import sys
+import tempfile
+import volfilter
+
+# It's just more convenient to have named fields.
+class Brick:
+    def __init__ (self, path, name):
+        self.path = path
+        self.sv_name = name
+        self.size = 0
+        self.curr_size = 0
+        self.good_size = 0
+    def set_size (self, size):
+        self.size = size
+    def set_range (self, rs, re):
+        self.r_start = rs
+        self.r_end = re
+        self.curr_size = self.r_end - self.r_start + 1
+    def __repr__ (self):
+        value = self.path[:]
+        value += "(%d," % self.size
+        if self.curr_size:
+            value += "0x%x,0x%x)" % (self.r_start, self.r_end)
+        else:
+            value += "-)"
+        return value
+
+def get_bricks (host, vol):
+    t = pipes.Template()
+    t.prepend("gluster --remote-host=%s system getspec %s"%(host,vol),".-")
+    return t.open(None,"r")
+
+def generate_stanza (vf, all_xlators, cur_subvol):
+    sv_list = []
+    for sv in cur_subvol.subvols:
+        generate_stanza(vf,all_xlators,sv)
+        sv_list.append(sv.name)
+    vf.write("volume %s\n"%cur_subvol.name)
+    vf.write(" type %s\n"%cur_subvol.type)
+    for kvpair in cur_subvol.opts.iteritems():
+        vf.write(" option %s %s\n"%kvpair)
+    if sv_list:
+        vf.write(" subvolumes %s\n"%string.join(sv_list))
+    vf.write("end-volume\n\n")
+
+
+def mount_brick (localpath, all_xlators, dht_subvol):
+
+    # Generate a volfile.
+    vf_name = localpath + ".vol"
+    vf = open(vf_name,"w")
+    generate_stanza(vf,all_xlators,dht_subvol)
+    vf.flush()
+    vf.close()
+
+    # Create a brick directory and mount the brick there.
+    os.mkdir(localpath)
+    subprocess.call(["glusterfs","-f",vf_name,localpath])
+
+# We use the command-line tools because there's no getxattr support in the
+# Python standard library (which is ridiculous IMO). Adding the xattr package
+# from PyPI would create a new and difficult dependency because the bits to
+# satisfy it don't seem to exist in Fedora. We already expect the command-line
+# tools to be there, so it's safer just to rely on them.
+#
+# We might have to revisit this if we get as far as actually issuing millions
+# of setxattr requests. Even then, it might be better to do that part with a C
+# program which has only a build-time dependency.
+def get_range (brick):
+    t = pipes.Template()
+    cmd = "getfattr -e hex -n trusted.glusterfs.dht %s 2> /dev/null"
+    t.prepend(cmd%brick,".-")
+    t.append("grep ^trusted.glusterfs.dht=","--")
+    f = t.open(None,"r")
+    try:
+        value = f.readline().rstrip().split('=')[1][2:]    # hex digits after "0x"
+    except:
+        print "could not get layout for %s (might be OK)" % brick
+        return None
+    v_start = int("0x"+value[16:24],16)    # third 32-bit word: range start
+    v_end = int("0x"+value[24:32],16)      # fourth 32-bit word: range end
+    return (v_start, v_end)
+
+def calc_sizes (bricks, total):
+    leftover = 1 << 32
+    for b in bricks:
+        if b.size:
+            b.good_size = (b.size << 32) / total
+            leftover -= b.good_size
+        else:
+            b.good_size = 0
+    if leftover:
+        # Add the leftover to an old brick if we can.
+        for b in bricks:
+            if b.good_size:
+                b.good_size += leftover
+                break
+        else:
+            # Fine, just add it wherever.
+            bricks[0].good_size += leftover
+
+# Normalization means (a) sorting the bricks by r_start and (b) ensuring that
+# there are no gaps.
+def normalize (in_bricks):
+    out_bricks = []
+    curr_hash = 0
+    used = 0
+    while curr_hash < (1<<32):
+        curr_best = None
+        for b in in_bricks:
+            # Bricks with no existing layout can't match here; they stay in
+            # in_bricks and end up at the tail of the returned list.
+            if b.curr_size and b.r_start == curr_hash:
+                used += 1
+                out_bricks.append(b)
+                in_bricks.remove(b)
+                curr_hash = b.r_end + 1
+                break
+        else:
+            print "gap found at 0x%08x" % curr_hash
+            sys.exit(1)
+    return out_bricks + in_bricks, used
+
+# Score a candidate brick ordering: for each brick that already has a layout,
+# count how much of its current hash range overlaps the range it would get in
+# this ordering.  A higher score means less data would have to move.
+def get_score (bricks):
+    score = 0
+    curr_hash = 0
+    for b in bricks:
+        if not b.curr_size:
+            curr_hash += b.good_size
+            continue
+        new_start = curr_hash
+        curr_hash += b.good_size
+        new_end = curr_hash - 1
+        if new_start > b.r_start:
+            max_start = new_start
+        else:
+            max_start = b.r_start
+        if new_end < b.r_end:
+            min_end = new_end
+        else:
+            min_end = b.r_end
+        if max_start <= min_end:
+            score += (min_end - max_start + 1)
+    return score
+
+if __name__ == "__main__":
+
+    my_usage = "%prog [options] server volume [directory]"
+    parser = optparse.OptionParser(usage=my_usage)
+    parser.add_option("-f", "--free-space", dest="free_space",
+                      default=False, action="store_true",
+                      help="use free space instead of total space")
+    parser.add_option("-l", "--leave-mounted", dest="leave_mounted",
+                      default=False, action="store_true",
+                      help="leave subvolumes mounted")
+    parser.add_option("-v", "--verbose", dest="verbose",
+                      default=False, action="store_true",
+                      help="verbose output")
+    options, args = parser.parse_args()
+
+    if len(args) == 3:
+        fix_dir = args[2]
+    else:
+        if len(args) != 2:
+            parser.print_help()
+            sys.exit(1)
+        fix_dir = None
+    hostname, volname = args[:2]
+
+    # Make sure stuff gets cleaned up, even if there are exceptions.
+    orig_dir = os.getcwd()
+    work_dir = tempfile.mkdtemp()
+    bricks = []
+    def cleanup_workdir ():
+        os.chdir(orig_dir)
+        if options.verbose:
+            print "Cleaning up %s" % work_dir
+        for b in bricks:
+            subprocess.call(["umount",b.path])
+        shutil.rmtree(work_dir)
+    if not options.leave_mounted:
+        atexit.register(cleanup_workdir)
+    os.chdir(work_dir)
+
+    # Mount each brick individually, so we can issue brick-specific calls.
+    if options.verbose:
+        print "Mounting subvolumes..."
+    index = 0
+    volfile_pipe = get_bricks(hostname,volname)
+    all_xlators, last_xlator = volfilter.load(volfile_pipe)
+    for dht_vol in all_xlators.itervalues():
+        if dht_vol.type == "cluster/distribute":
+            break
+    else:
+        print "no DHT volume found"
+        sys.exit(1)
+    for sv in dht_vol.subvols:
+        #print "found subvol %s" % sv.name
+        lpath = "%s/brick%s" % (work_dir, index)
+        index += 1
+        mount_brick(lpath,all_xlators,sv)
+        bricks.append(Brick(lpath,sv.name))
+    if index == 0:
+        print "no bricks"
+        sys.exit(1)
+
+    # Collect all of the sizes.
+    if options.verbose:
+        print "Collecting information..."
+    total = 0
+    for b in bricks:
+        info = os.statvfs(b.path)
+        # We want a standard unit even if different bricks use
+        # different block sizes.  The unit size is chosen to avoid overflows
+        # for very large bricks with very small block sizes, but also to
+        # accommodate filesystems which use very large block sizes to
+        # cheat on benchmarks.
+        blocksper100mb = 104857600 / info[0]    # info[0] is f_bsize
+        if options.free_space:
+            size = info[3] / blocksper100mb     # info[3] is f_bfree
+        else:
+            size = info[2] / blocksper100mb     # info[2] is f_blocks
+        if size <= 0:
+            print "brick %s has invalid size %d" % (b.path, size)
+            sys.exit(1)
+        b.set_size(size)
+        total += size
+
+    # Collect all of the layout information.
+    for b in bricks:
+        hash_range = get_range(b.path)
+        if hash_range is not None:
+            rs, re = hash_range
+            if rs > re:
+                print "%s has backwards hash range" % b.path
+                sys.exit(1)
+            b.set_range(hash_range[0],hash_range[1])
+
+    if options.verbose:
+        print "Calculating new layouts..."
+    calc_sizes(bricks,total)
+    bricks, used = normalize(bricks)
+
+    # We can't afford O(n!) here, but O(n^2) should be OK and the result
+    # should be almost as good.
+    while used < len(bricks):
+        best_place = used
+        best_score = get_score(bricks)
+        for i in xrange(used):
+            new_bricks = bricks[:]
+            del new_bricks[used]
+            new_bricks.insert(i,bricks[used])
+            new_score = get_score(new_bricks)
+            if new_score > best_score:
+                best_place = i
+                best_score = new_score
+        if best_place != used:
+            nb = bricks[used]
+            del bricks[used]
+            bricks.insert(best_place,nb)
+        used += 1
+
+    # Finalize whatever we decided on.
+    curr_hash = 0
+    for b in bricks:
+        b.r_start = curr_hash
+        curr_hash += b.good_size
+        b.r_end = curr_hash - 1
+
+    print "Here are the xattr values for your size-weighted layout:"
+    for b in bricks:
+        print " %s: 0x0000000200000000%08x%08x" % (
+            b.sv_name, b.r_start, b.r_end)
+
+    if fix_dir:
+        if options.verbose:
+            print "Fixing layout for %s" % fix_dir
+        for b in bricks:
+            value = "0x0000000200000000%08x%08x" % (
+                b.r_start, b.r_end)
+            path = "%s/%s" % (b.path, fix_dir)
+            cmd = "setfattr -n trusted.glusterfs.dht -v %s %s" % (
+                value, path)
+            print cmd
+
+    if options.leave_mounted:
+        print "The following subvolumes are still mounted:"
+        for b in bricks:
+            print "%s on %s" % (b.sv_name, b.path)
+        print "Don't forget to clean up when you're done."
+
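
For a rough sense of what the script computes: each brick receives a share of the 32-bit hash space proportional to its size (total space or, with -f, free space, measured in 100 MB units), and the integer-division leftover is handed to one brick so the ranges tile 0x00000000 through 0xffffffff exactly. The sketch below redoes that arithmetic standalone with invented brick sizes; unlike calc_sizes, it simply gives the leftover to the first brick rather than preferring one that already holds data. In a real run the bricks come from the volfile fetched via gluster system getspec, and the script is invoked per its usage string, e.g. ./rebalance.py -v SERVER VOLUME (server and volume names are placeholders).

    # Hedged sketch of the proportional split done by calc_sizes() and the
    # final range assignment in rebalance.py; brick sizes are invented and are
    # in the script's 100 MB units.
    sizes = [1000, 2000, 5000]                     # three hypothetical bricks
    total = sum(sizes)

    shares = [(s << 32) // total for s in sizes]   # floor division, as in the script
    shares[0] += (1 << 32) - sum(shares)           # hand the rounding leftover to one brick

    curr_hash = 0
    for size, share in zip(sizes, shares):
        r_start = curr_hash
        curr_hash += share
        r_end = curr_hash - 1
        print("size %5d -> 0x%08x..0x%08x" % (size, r_start, r_end))
    assert curr_hash == 1 << 32                    # the ranges tile the whole hash space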