| field | value | date |
|---|---|---|
| author | Anand Avati <avati@redhat.com> | 2012-08-30 13:15:39 -0700 |
| committer | Anand Avati <avati@redhat.com> | 2012-09-12 14:29:51 -0700 |
| commit | 4f87fd0ae2ce629576ca5f647a99888d31a46815 (patch) | |
| tree | b548adb73477f5e23905adc18fefd90fa9c9e3e8 /xlators/cluster/dht/src | |
| parent | c3d7286a67ce0ac4db9cb8fa079a48f423245000 (diff) | |
dht: improve dht_fix_layout_of_directory for better re-assignment
Jeff Darcy wrote:
> AFAICT, the fix-layout code doesn't do the same rotation that the
> new-directory code does. Therefore, the new bricks always claim
> completely predictable hash ranges for every directory, leading to
> either a 0-1-2-3 pattern or a 1-0-2-3 pattern.  In other words, a
> file whose hash falls into the second quarter of the range will always
> be assigned to brick 2, and a file whose hash falls into the fourth
> quarter will always be assigned to brick 3.  The rest will be split
> according to the original pattern.  Put still another way, instead of
> same-named files in different directories being spread across N bricks,
> they might be spread across only two bricks (bad) or totally
> concentrated on one brick (worse) regardless of N.
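(To make the failure mode Jeff describes concrete: DHT gives each brick a slice of the hash space per directory, hashes a file's name, and places the file on the brick whose slice contains the hash. The toy program below is illustrative only -- its hash function and fixed quarter ranges are made up and are not GlusterFS code -- but it shows why fixed quarters pin a given file name to the same brick in every directory.)

```c
/* Toy model of DHT file placement -- NOT GlusterFS code. The hash
 * function and the fixed quarter ranges are stand-ins for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint32_t toy_hash (const char *name)
{
        uint32_t h = 2166136261u;               /* FNV-1a, illustrative only */
        while (*name)
                h = (h ^ (uint8_t)*name++) * 16777619u;
        return h;
}

int main (void)
{
        /* four bricks, each owning a fixed quarter of the 32-bit space */
        uint32_t start[4] = { 0x00000000, 0x40000000, 0x80000000, 0xc0000000 };
        uint32_t stop[4]  = { 0x3fffffff, 0x7fffffff, 0xbfffffff, 0xffffffff };
        const char *name  = "data.log";
        uint32_t    hash  = toy_hash (name);

        for (int i = 0; i < 4; i++)
                if (hash >= start[i] && hash <= stop[i])
                        /* if every directory uses these same quarters, this
                           file name lands on the same brick in all of them */
                        printf ("%s (hash 0x%08x) -> brick %d\n",
                                name, (unsigned) hash, i + 1);
        return 0;
}
```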
The current dht_fix_layout_of_directory() code, in an attempt to
maximize the overlap of the new layout with the existing one (to
minimize movement of data), fails to do a good job of randomizing the
new assignment even when it could. In an example where we expand
from 2 nodes to 4 nodes, the current possibilities are limited in the
following way -
(theoretical hash range: 00 - 99)
OLD 1
-----
server1: 00 - 49
server2: 50 - 99
NEW 1
-----
server1: 00 - 24
server2: 50 - 74
server3: 25 - 49
server4: 75 - 99
OLD 2
-----
server1: 50 - 99
server2: 00 - 49
NEW 2
------
server1: 50 - 74
server2: 00 - 24
server3: 25 - 49
server4: 75 - 99
The above shows that when doing an add-brick from 2 bricks to 4, server3
and server4 always get the _same_ hash ranges, no matter what the original
hash range assignment was.
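(For scale: the 00 - 99 range used above is only illustrative. The real hash space is 32-bit, and the old code carved it into equal chunks with `chunk = 0xffffffff / cnt`, stretching the last chunk up to 0xffffffff. Below is a minimal sketch of that arithmetic -- illustrative only, with the exact boundary handling done by DHT_SET_LAYOUT_RANGE glossed over.)

```c
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
        int      cnt   = 4;                       /* number of subvolumes */
        uint32_t chunk = 0xffffffff / cnt;        /* size of one hash range */

        for (int i = 0; i < cnt; i++) {
                uint32_t start = i * chunk;
                /* the last range is stretched up to 0xffffffff, as the old
                   code did when it reached the final chunk */
                uint32_t stop  = (i == cnt - 1) ? 0xffffffff
                                                : start + chunk - 1;
                printf ("range %d: 0x%08" PRIx32 " - 0x%08" PRIx32 "\n",
                        i, start, stop);
        }
        return 0;
}
```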
The fix in this patch is to first do the standard new-directory assignment
to the directory (with rotation etc.) and only then re-assign ranges to
maximize overlap. This way newly added servers still get random (rotated)
ranges, and each existing server has a chance of getting either of the
quarters which were part of its half previously. The same principle holds
for any add-brick from M to M+N bricks.
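(A self-contained toy of this two-phase idea, using the OLD 2 / 4-brick example above on the 00 - 99 scale. The `range_t` struct and `overlap()` helper are made up for illustration; the patch's real implementation is dht_selfheal_layout_maximize_overlap in the diff below.)

```c
#include <stdio.h>

typedef struct { unsigned start, stop; } range_t;

/* overlap of two inclusive ranges; 0 if they are disjoint */
static unsigned overlap (range_t a, range_t b)
{
        if (a.start > b.stop || a.stop < b.start)
                return 0;
        return (a.stop < b.stop ? a.stop : b.stop) -
               (a.start > b.start ? a.start : b.start);
}

int main (void)
{
        /* OLD 2: server1 owned 50-99, server2 owned 00-49;
           server3/server4 are new and owned nothing ({0,0} here). */
        range_t old[4] = { {50, 99}, {0, 49}, {0, 0}, {0, 0} };
        /* one possible rotated "new directory" layout for 4 servers */
        range_t new[4] = { {25, 49}, {50, 74}, {75, 99}, {0, 24} };

        for (int i = 0; i < 4; i++) {
                int best = i;
                /* entries before i have already picked their best match */
                for (int k = i + 1; k < 4; k++)
                        if (overlap (old[i], new[k]) > overlap (old[i], new[best]))
                                best = k;
                if (best != i) {                  /* swap only when it helps */
                        range_t tmp = new[i];
                        new[i]      = new[best];
                        new[best]   = tmp;
                }
        }

        for (int i = 0; i < 4; i++)
                printf ("server%d: %02u - %02u\n",
                        i + 1, new[i].start, new[i].stop);
        return 0;
}
```

With this input it prints server1: 50 - 74, server2: 25 - 49, server3: 75 - 99, server4: 00 - 24: the existing servers keep halves of their previous ranges, while the new servers keep whatever the per-directory rotation handed them.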
Change-Id: I0cbbf3bfa334645728072d66aaaa80120d0b295f
BUG: 853258
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/3883
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster/dht/src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 223 |
1 file changed, 78 insertions, 145 deletions
```diff
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 8b7d15ce520..8ac5a9e9e44 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -28,42 +28,25 @@
                         layout->list[i].xlator->name, path);            \
         } while (0)
 
-static inline uint32_t
-dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop,
-                  uint32_t chunk_size)
+
+static uint32_t
+dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
 {
-        uint32_t overlap = 0;
-        uint32_t chunk_begin = 0;
+        if (o >= old->cnt || n >= new->cnt)
+                return 0;
 
-        chunk_begin = cnk_idx * chunk_size;
+        if (old->list[o].err > 0 || new->list[n].err > 0)
+                return 0;
 
-        /* There is no chance of overlap */
-        if ((chunk_begin > stop) ||
-            ((chunk_begin + chunk_size) < start))
-                goto out;
+        if ((old->list[o].start > new->list[n].stop) ||
+            (old->list[o].stop < new->list[n].start))
+                return 0;
 
-        if ((chunk_begin <= start) &&
-            ((chunk_begin + chunk_size) <= stop)) {
-                overlap = ((chunk_begin + chunk_size) - start);
-                goto out;
-        }
-
-        if ((chunk_begin <= start) &&
-            ((chunk_begin + chunk_size) >= stop)) {
-                overlap = (stop - start);
-                goto out;
-        }
-
-        if ((chunk_begin < stop) &&
-            ((chunk_begin + chunk_size) >= stop)) {
-                overlap = (stop - chunk_begin);
-                goto out;
-        }
-
-out:
-        return overlap;
+        return max (old->list[o].start, new->list[n].start) -
+                min (old->list[o].stop, new->list[n].stop);
 }
 
+
 int
 dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
 {
@@ -516,7 +499,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
                 for (j = 0; j < conf->subvolume_cnt; j++) {
                         if (conf->decommissioned_bricks[j] &&
                             conf->decommissioned_bricks[j] == layout->list[i].xlator) {
-                                layout->list[i].err = -EINVAL;
+                                layout->list[i].err = EINVAL;
                                 break;
                         }
                 }
@@ -548,24 +531,64 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
 }
 
+
+void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
+                                        dht_layout_t *new_layout);
+
+void dht_layout_entry_swap (dht_layout_t *layout, int i, int j);
+
+void
+dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
+                                      dht_layout_t *new, dht_layout_t *old)
+{
+        int           i            = 0;
+        int           k            = 0;
+        uint32_t      curr_overlap = 0;
+        uint32_t      max_overlap  = 0;
+        int           max_overlap_idx = -1;
+        uint32_t      overlap      = 0;
+
+
+        dht_layout_sort_volname (old);
+        /* Now both old_layout->list[] and new_layout->list[]
+           are match the same xlators/subvolumes. i.e,
+           old_layout->[i] and new_layout->[i] are referring
+           to the same subvolumes
+        */
+
+        for (i = 0; i < new->cnt; i++) {
+                if (new->list[i].err > 0)
+                        /* Subvol might be marked for decommission
+                           with EINVAL, or some other serious error
+                           marked with positive errno.
+                        */
+                        continue;
+
+                curr_overlap = dht_overlap_calc (old, i, new, i);
+                max_overlap = curr_overlap;
+                max_overlap_idx = i;
+
+                /* Subvols up to `i' have already found their best
+                   matches. Search only from the rest.
+                */
+                for (k = (i + 1); k < new->cnt; k++) {
+                        overlap = dht_overlap_calc (old, i, new, k);
+                        if (overlap > max_overlap) {
+                                max_overlap_idx = k;
+                                max_overlap = overlap;
+                        }
+                }
+
+                if (max_overlap > curr_overlap)
+                        dht_layout_entry_swap (new, i, max_overlap_idx);
+        }
+}
+
+
 dht_layout_t *
 dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
                              dht_layout_t *layout)
 {
-        uint32_t      chunk        = 0;
-        uint32_t      start        = 0;
-        uint32_t      stop         = 0;
-        uint32_t      overlap      = 0;
-        uint32_t      max_overlap  = 0;
-        uint32_t      chunk_begin  = 0;
-        int           count        = 0;
-        int           cnt          = 0;
         int           i            = 0;
-        int           j            = 0;
-        int           k            = 0;
-        int           loop_cnt     = 0;
-        int           start_subvol = 0;
-        int          *fix_array    = NULL;
         xlator_t     *this         = NULL;
         dht_layout_t *new_layout   = NULL;
         dht_conf_t   *priv         = NULL;
@@ -581,114 +604,26 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
                 goto done;
         }
 
-        count = cnt = dht_get_layout_count (this, layout, 0);
-
-        chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
-
-        start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
-
-        fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char);
-        if (!fix_array) {
-                /* No fix, use the existing layout itself */
-                goto done;
-        }
-
         new_layout = dht_layout_new (this, priv->subvolume_cnt);
         if (!new_layout)
                 goto done;
 
         for (i = 0; i < new_layout->cnt; i++) {
-                /* TODO: fix this in layout_alloc() itself */
-                new_layout->list[i].err = -ENOENT;
-                if (i < layout->cnt)
-                        new_layout->list[i].xlator = layout->list[i].xlator;
-        }
-
-        /* Check if there are any overlap in layout, and give the proper fix */
-        for (i = 0; i < layout->cnt; i++) {
-                /* No need to fix if 'err' is not '-1' */
-                if (layout->list[i].err != -1)
-                        continue;
-
-                /* If already existing layout is having no range, skip it */
-                start = layout->list[i].start;
-                stop  = layout->list[i].stop;
-                if ((stop - start) == 0)
-                        continue;
-
-                max_overlap = 0;
-
-                /* 'j' is used as starting point of each chunk */
-                for (j = 1; j <= count; j++) {
-                        /* if chunk is already used, don't use it again */
-                        for (k = 0; k < i; k++)
-                                if (j == fix_array[k])
-                                        break;
-                        if (k < i)
-                                continue;
-
-                        overlap = dht_find_overlap (i, (j-1), start, stop, chunk);
-                        if (max_overlap < overlap) {
-                                max_overlap = overlap;
-                                fix_array[i] = j;
-                        }
-                }
-
-                /* If we have any overlap, then use that itself as new
-                   layout for the subvolume */
-                if (fix_array[i]) {
-                        chunk_begin = chunk * (fix_array[i] - 1);
-                        new_layout->list[i].err = -1;
-                        DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin,
-                                              chunk, cnt, loc->path);
-                        /* make sure to give (max - 1) as 'stop' range,
-                           if it is last chunk */
-                        if (fix_array[i] == count)
-                                new_layout->list[i].stop = 0xffffffff;
-                        if (--cnt == 0)
-                                goto done;
+                if (layout->list[i].err != ENOSPC)
+                        new_layout->list[i].err = layout->list[i].err;
+                else
+                        new_layout->list[i].err = -1;
 
-                }
+                new_layout->list[i].xlator = layout->list[i].xlator;
         }
 
-        /* Now, look for layouts which are not having any overlaps
-           and give it a fix */
-        for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt;
-             i++, loop_cnt++) {
-                if (i == new_layout->cnt)
-                        i = 0;
-
-                /* If 'fix_array[i]' is set, the layout is already fixed. */
-                if (fix_array[i])
-                        continue;
-
-                if (layout->list[i].err != -1) {
-                        new_layout->list[i].err = layout->list[i].err;
-                        continue;
-                }
-
-                for (k = 1; k <= count; k++) {
-                        for (j = 0; j < new_layout->cnt; j++) {
-                                if (k == fix_array[j])
-                                        break;
-                        }
-                        /* Didn't find any of the list begining with 'k' */
-                        if (j == new_layout->cnt)
-                                break;
-                }
+        /* First give it a layout as though it is a new directory. This
+           ensures rotation to kick in */
+        dht_layout_sort_volname (new_layout);
+        dht_selfheal_layout_new_directory (frame, loc, new_layout);
 
-                fix_array[i] = k;
-                chunk_begin = (k - 1) * chunk;
-                new_layout->list[i].err = -1;
-                DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt,
-                                      loc->path);
-                /* make sure to give (max - 1) as 'stop' range,
-                   if it is last chunk */
-                if (k == count)
-                        new_layout->list[i].stop = 0xffffffff;
-                if (--cnt == 0)
-                        goto done;
-        }
+        /* Now selectively re-assign ranges only when it helps */
+        dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout);
 
 done:
         if (new_layout) {
@@ -702,8 +637,6 @@ done:
                 local->layout = new_layout;
         }
 
-        GF_FREE (fix_array);
-
         return local->layout;
 }
```
