diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2013-02-05 19:19:06 -0500 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2013-02-07 08:27:40 -0800 |
commit | da9d54cac629d9c0f7ae6b431abfb134b5f0eca3 (patch) | |
tree | 27f021d1db5756e098856ab4dea806ee8f0e02e9 | |
parent | 7a9e27073789bdc0164cc6428aee5088a8faa2fb (diff) |
dht: better layout-optimization algorithm
This method deals with the case where swapping might gain a bigger overlap
for the xlator currently under consideration, but sacrifices even more from
the xlator we're swapping with. For example:
A = 0x00000000 - 0x44444443 (new 0x00000000 - 0x55555554)
B = 0x44444444 - 0x77777776 (new 0x55555555 - 0xaaaaaaa9)
C = 0x77777777 - 0xffffffff (new 0xaaaaaaaa - 0xffffffff)
Here, the new range for B has a bigger overlap with the old C than with the
old B (0x33333333 vs. 0x22222222 to be precise) so looking only at that
might lead us to swap. However, such a swap turns the new C's overlap from
0x55555556 (vs. old C) to *zero* (vs. old B). In other words, we've gained
0x11111111 for B but lost 0x55555556 for C, so it's a bad idea.
The new algorithm accounts for all effects of the swap, so it not only avoids
bad swaps but can make some good ones that would have been missed previously.
For example, if swapping a range X with a later range Y would not increase the
overlap for X we would previously have skipped it even if the swap would
increase Y's overlap without affecting X's. This is the normal case when we're
adding a new brick (which initially has zero overlap with any old range) so
finding more good swaps is probably even more important than avoiding bad ones.
Also, the logic in dht_overlap_calc was completely broken before, causing
integer overflows instead of providing correct values, so no matter what
higher-level algorithm was in place the GIGO effect would have resulted in
bad decisions.
Change-Id: If61ed513cfcb931916c6b51da293e3efbaaf385f
BUG: 853258
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/3908
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
-rwxr-xr-x | tests/bugs/bug-853258.t | 45 | ||||
-rwxr-xr-x | tests/bugs/overlap.py | 59 | ||||
-rw-r--r-- | tests/volume.rc | 9 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 16 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 82 |
5 files changed, 187 insertions, 24 deletions
diff --git a/tests/bugs/bug-853258.t b/tests/bugs/bug-853258.t new file mode 100755 index 00000000000..c702e6f30ea --- /dev/null +++ b/tests/bugs/bug-853258.t @@ -0,0 +1,45 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd + +mkdir -p $H0:$B0/${V0}0 +mkdir -p $H0:$B0/${V0}1 +mkdir -p $H0:$B0/${V0}2 +mkdir -p $H0:$B0/${V0}3 + +# Create and start a volume. +TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 $H0:$B0/${V0}2 +TEST $CLI volume start $V0 +EXPECT_WITHIN 15 'Started' volinfo_field $V0 'Status'; + +# Force assignment of initial ranges. +TEST $CLI volume rebalance $V0 fix-layout start +EXPECT_WITHIN 15 "success:" rebalance_status_field $V0 + +# Get the original values. +xattrs="" +for i in $(seq 0 2); do + xattrs="$xattrs $(dht_get_layout $B0/${V0}$i)" +done + +# Expand the volume and force assignment of new ranges. +TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3 +# Force assignment of initial ranges. +TEST $CLI volume rebalance $V0 fix-layout start +EXPECT_WITHIN 15 "success:" rebalance_status_field $V0 + +for i in $(seq 0 3); do + xattrs="$xattrs $(dht_get_layout $B0/${V0}$i)" +done + +overlap=$($(dirname $0)/overlap.py $xattrs) +# 2863311531 = 0xaaaaaaab = 2/3 overlap +TEST [ "$overlap" -ge 2863311531 ] + +cleanup diff --git a/tests/bugs/overlap.py b/tests/bugs/overlap.py new file mode 100755 index 00000000000..15f2da473f1 --- /dev/null +++ b/tests/bugs/overlap.py @@ -0,0 +1,59 @@ +#!/usr/bin/python + +import sys + +def calculate_one (ov, nv): + old_start = int(ov[18:26],16) + old_end = int(ov[26:34],16) + new_start = int(nv[18:26],16) + new_end = int(nv[26:34],16) + if (new_end < old_start) or (new_start > old_end): + #print '%s, %s -> ZERO' % (ov, nv) + return 0 + all_start = max(old_start,new_start) + all_end = min(old_end,new_end) + #print '%s, %s -> %08x' % (ov, nv, all_end - all_start + 1) + return all_end - all_start + 1 + +def calculate_all (values): + total = 0 + nv_index = len(values) / 2 + for old_val in values[:nv_index]: + new_val = values[nv_index] + nv_index += 1 + total += calculate_one(old_val,new_val) + return total + +""" +test1_vals = [ + '0x0000000000000000000000003fffffff', # first quarter + '0x0000000000000000400000007fffffff', # second quarter + '0x000000000000000080000000ffffffff', # second half + '0x00000000000000000000000055555554', # first third + '0x000000000000000055555555aaaaaaa9', # second third + '0x0000000000000000aaaaaaaaffffffff', # last third +] + +test2_vals = [ + '0x0000000000000000000000003fffffff', # first quarter + '0x0000000000000000400000007fffffff', # second quarter + '0x000000000000000080000000ffffffff', # second half + '0x00000000000000000000000055555554', # first third + # Next two are (incorrectly) swapped. + '0x0000000000000000aaaaaaaaffffffff', # last third + '0x000000000000000055555555aaaaaaa9', # second third +] + +print '%08x' % calculate_one(test1_vals[0],test1_vals[3]) +print '%08x' % calculate_one(test1_vals[1],test1_vals[4]) +print '%08x' % calculate_one(test1_vals[2],test1_vals[5]) +print '= %08x' % calculate_all(test1_vals) +print '%08x' % calculate_one(test2_vals[0],test2_vals[3]) +print '%08x' % calculate_one(test2_vals[1],test2_vals[4]) +print '%08x' % calculate_one(test2_vals[2],test2_vals[5]) +print '= %08x' % calculate_all(test2_vals) +""" + +if __name__ == '__main__': + # Return decimal so bash can reason about it. + print '%d' % calculate_all(sys.argv[1:]) diff --git a/tests/volume.rc b/tests/volume.rc index 9daf560b7b8..2a6ada3e6c8 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -26,8 +26,8 @@ function volume_option() $CLI volume info $vol | egrep "^$key: " | cut -f2 -d' '; } -function rebalance_status_completed_field { - $CLI volume rebalance $V0 status | awk '{print $6}' | sed -n 3p +function rebalance_status_field { + $CLI volume rebalance $1 status | sed -n '$p' | cut -d' ' -f4 } function remove_brick_status_completed_field { @@ -194,3 +194,8 @@ function gd_is_replace_brick_completed { echo "N" fi } + +function dht_get_layout { + local my_xa=trusted.glusterfs.dht + getfattr -d -e hex -n $my_xa $1 2> /dev/null | grep "$my_xa=" | cut -d= -f2 +} diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 8cae5265391..34a7475bdc3 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -412,6 +412,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) layout->list[j].err = err_swap; } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} + int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 8463abdbfaf..77afde82e16 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -38,12 +38,20 @@ dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) if (old->list[o].err > 0 || new->list[n].err > 0) return 0; + if (old->list[o].start == old->list[o].stop) { + return 0; + } + + if (new->list[n].start == new->list[n].stop) { + return 0; + } + if ((old->list[o].start > new->list[n].stop) || (old->list[o].stop < new->list[n].start)) return 0; - return max (old->list[o].start, new->list[n].start) - - min (old->list[o].stop, new->list[n].stop); + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; } @@ -564,18 +572,25 @@ void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *new_layout); void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] void dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, dht_layout_t *new, dht_layout_t *old) { int i = 0; - int k = 0; + int j = 0; uint32_t curr_overlap = 0; uint32_t max_overlap = 0; int max_overlap_idx = -1; uint32_t overlap = 0; - + uint32_t *table = NULL; dht_layout_sort_volname (old); /* Now both old_layout->list[] and new_layout->list[] @@ -584,31 +599,54 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, to the same subvolumes */ + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*new->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + for (i = 0; i < new->cnt; i++) { - if (new->list[i].err > 0) + if (new->list[i].err > 0) { /* Subvol might be marked for decommission with EINVAL, or some other serious error marked with positive errno. */ continue; + } - curr_overlap = dht_overlap_calc (old, i, new, i); - max_overlap = curr_overlap; - max_overlap_idx = i; - - /* Subvols up to `i' have already found their best - matches. Search only from the rest. - */ - for (k = (i + 1); k < new->cnt; k++) { - overlap = dht_overlap_calc (old, i, new, k); - if (overlap > max_overlap) { - max_overlap_idx = k; - max_overlap = overlap; - } - } - - if (max_overlap > curr_overlap) - dht_layout_entry_swap (new, i, max_overlap_idx); + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } } } |