diff options
| author | Jeff Darcy <jdarcy@redhat.com> | 2013-02-05 19:19:06 -0500 | 
|---|---|---|
| committer | Anand Avati <avati@redhat.com> | 2013-02-07 08:27:40 -0800 | 
| commit | da9d54cac629d9c0f7ae6b431abfb134b5f0eca3 (patch) | |
| tree | 27f021d1db5756e098856ab4dea806ee8f0e02e9 /xlators/cluster | |
| parent | 7a9e27073789bdc0164cc6428aee5088a8faa2fb (diff) | |
dht: better layout-optimization algorithm
This method deals with the case where swapping might gain a bigger overlap
for the xlator currently under consideration, but sacrifices even more from
the xlator we're swapping with. For example:
A = 0x00000000 - 0x44444443 (new 0x00000000 - 0x55555554)
B = 0x44444444 - 0x77777776 (new 0x55555555 - 0xaaaaaaa9)
C = 0x77777777 - 0xffffffff (new 0xaaaaaaaa - 0xffffffff)
Here, the new range for B has a bigger overlap with the old C than with the
old B (0x33333333 vs. 0x22222222 to be precise) so looking only at that
might lead us to swap. However, such a swap turns the new C's overlap from
0x55555556 (vs. old C) to *zero* (vs. old B).  In other words, we've gained
0x11111111 for B but lost 0x55555556 for C, so it's a bad idea.
The new algorithm accounts for all effects of the swap, so it not only avoids
bad swaps but can make some good ones that would have been missed previously.
For example, if swapping a range X with a later range Y would not increase the
overlap for X we would previously have skipped it even if the swap would
increase Y's overlap without affecting X's.  This is the normal case when we're
adding a new brick (which initially has zero overlap with any old range) so
finding more good swaps is probably even more important than avoiding bad ones.
Also, the logic in dht_overlap_calc was completely broken before, causing
integer overflows instead of providing correct values, so no matter what
higher-level algorithm was in place the GIGO effect would have resulted in
bad decisions.
Change-Id: If61ed513cfcb931916c6b51da293e3efbaaf385f
BUG: 853258
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/3908
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/cluster')
| -rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 16 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 82 | 
2 files changed, 76 insertions, 22 deletions
| diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 8cae52653..34a7475bd 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -412,6 +412,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)          layout->list[j].err    = err_swap;  } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ +        uint32_t  start_swap = 0; +        uint32_t  stop_swap = 0; + +        start_swap  = layout->list[i].start; +        stop_swap   = layout->list[i].stop; + +        layout->list[i].start  = layout->list[j].start; +        layout->list[i].stop   = layout->list[j].stop; + +        layout->list[j].start  = start_swap; +        layout->list[j].stop   = stop_swap; +} +  int64_t  dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)  { diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 8463abdbf..77afde82e 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -38,12 +38,20 @@ dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)  	if (old->list[o].err > 0 || new->list[n].err > 0)  		return 0; +        if (old->list[o].start == old->list[o].stop) { +                return 0; +        } + +        if (new->list[n].start == new->list[n].stop) { +                return 0; +        } +  	if ((old->list[o].start > new->list[n].stop) ||  	    (old->list[o].stop < new->list[n].start))  		return 0; -	return max (old->list[o].start, new->list[n].start) - -		min (old->list[o].stop, new->list[n].stop); +	return min (old->list[o].stop, new->list[n].stop) - +	        max (old->list[o].start, new->list[n].start) + 1;  } @@ -564,18 +572,25 @@ void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,  					dht_layout_t *new_layout);  void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y)      table[x*new->cnt+y]  void  dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,  				      dht_layout_t *new, dht_layout_t *old)  {  	int           i            = 0; -	int           k            = 0; +	int           j            = 0;  	uint32_t      curr_overlap = 0;  	uint32_t      max_overlap  = 0;  	int           max_overlap_idx = -1;  	uint32_t      overlap      = 0; - +        uint32_t     *table = NULL;  	dht_layout_sort_volname (old);  	/* Now both old_layout->list[] and new_layout->list[] @@ -584,31 +599,54 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,  	   to the same subvolumes  	*/ +        /* Build a table of overlaps between new[i] and old[j]. */ +        table = alloca(sizeof(overlap)*old->cnt*new->cnt); +        if (!table) { +                return; +        } +        memset(table,0,sizeof(overlap)*new->cnt*new->cnt); +        for (i = 0; i < new->cnt; ++i) { +                for (j = 0; j < old->cnt; ++j) { +                        OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); +                } +        } +  	for (i = 0; i < new->cnt; i++) { -		if (new->list[i].err > 0) +		if (new->list[i].err > 0) {  			/* Subvol might be marked for decommission  			   with EINVAL, or some other serious error  			   marked with positive errno.  			*/  			continue; +                } -		curr_overlap = dht_overlap_calc (old, i, new, i); -		max_overlap = curr_overlap; -		max_overlap_idx = i; - -		/* Subvols up to `i' have already found their best -		   matches. Search only from the rest. -		*/ -		for (k = (i + 1); k < new->cnt; k++) { -			overlap = dht_overlap_calc (old, i, new, k); -			if (overlap > max_overlap) { -				max_overlap_idx = k; -				max_overlap = overlap; -			} -		} - -		if (max_overlap > curr_overlap) -			dht_layout_entry_swap (new, i, max_overlap_idx); +                max_overlap = 0; +                max_overlap_idx = i; +                for (j = (i + 1); j < new->cnt; ++j) { +                        /* Calculate the overlap now. */ +                        curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); +                        /* Calculate the overlap after the proposed swap. */ +                        overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); +                        /* Are we better than status quo? */ +                        if (overlap > curr_overlap) { +                                overlap -= curr_overlap; +                                /* Are we better than the previous choice? */ +                                if (overlap > max_overlap) { +                                        max_overlap = overlap; +                                        max_overlap_idx = j; +                                } +                        } +                } + +		if (max_overlap_idx != i) { + 			dht_layout_range_swap (new, i, max_overlap_idx); +                        /* Need to swap the table values too. */ +                        for (j = 0; j < old->cnt; ++j) { +                                overlap = OV_ENTRY(i,j); +                                OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); +                                OV_ENTRY(max_overlap_idx,j) = overlap; +                        } +                }  	}  } | 
