author    Anand Avati <avati@redhat.com>    2012-08-30 13:15:39 -0700
committer Anand Avati <avati@redhat.com>    2012-09-12 14:29:51 -0700
commit    4f87fd0ae2ce629576ca5f647a99888d31a46815 (patch)
tree      b548adb73477f5e23905adc18fefd90fa9c9e3e8 /xlators/cluster/dht
parent    c3d7286a67ce0ac4db9cb8fa079a48f423245000 (diff)
dht: improve dht_fix_layout_of_directory for better re-assignment
Jeff Darcy wrote:

> AFAICT, the fix-layout code doesn't do the same rotation that the
> new-directory code does. Therefore, the new bricks always claim
> completely predictable hash ranges for every directory, leading to
> either a 0-1-2-3 pattern or a 1-0-2-3 pattern. In other words, a
> file whose hash falls into the second quarter of the range will always
> be assigned to brick 2, and a file whose hash falls into the fourth
> quarter will always be assigned to brick 3. The rest will be split
> according to the original pattern. Put still another way, instead of
> same-named files in different directories being spread across N bricks,
> they might be spread across only two bricks (bad) or totally
> concentrated on one brick (worse) regardless of N.

The current dht_fix_layout_of_directory() code, in an attempt to
maximize the overlap of the new layout with the existing layout (to
minimize movement of data), fails to do a good job of randomizing the
new assignment even when it could. In an example where we expand from
2 nodes to 4 nodes, the current possibilities are limited in the
following way (theoretical hash range: 00 - 99):

OLD 1
-----
server1: 00 - 49
server2: 50 - 99

NEW 1
-----
server1: 00 - 24
server2: 50 - 74
server3: 25 - 49
server4: 75 - 99

OLD 2
-----
server1: 50 - 99
server2: 00 - 49

NEW 2
-----
server1: 50 - 74
server2: 00 - 24
server3: 25 - 49
server4: 75 - 99

The above shows that on an add-brick from 2 bricks to 4 bricks, server3
and server4 always get the _same_ hash range no matter what the
original hash range assignment was.

The fix in this patch is to first do the standard new-directory
assignment for the directory (with rotation etc.) and only then do the
reassignment that maximizes overlap. This way newly added servers still
get random ranges, and each existing server has a chance of keeping
either of the quarters that were part of its half previously. The same
principle holds for any add-brick from M to M+N bricks.

Change-Id: I0cbbf3bfa334645728072d66aaaa80120d0b295f
BUG: 853258
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/3883
Tested-by: Gluster Build System <jenkins@build.gluster.com>
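The following is a minimal, self-contained sketch in C of the two-phase
idea described above. It is not the patch itself: range_t, overlap_len()
and maximize_overlap() are hypothetical names used only for illustration;
the real code works on dht_layout_t entries, sorted by volume name so
that old[i] and new[i] refer to the same subvolume.

/*
 * Sketch only. Assumes phase 1 (assigning 'new' as if the directory
 * were brand new: equal chunks with a rotated/random start) has already
 * run; this shows only the overlap-maximizing swap of phase 2.
 */
#include <stdint.h>

typedef struct {
        uint32_t start;
        uint32_t stop;
} range_t;

/* Length of the intersection of two inclusive ranges; 0 if disjoint. */
static uint32_t
overlap_len (range_t a, range_t b)
{
        if (a.start > b.stop || a.stop < b.start)
                return 0;
        return ((a.stop < b.stop) ? a.stop : b.stop) -
               ((a.start > b.start) ? a.start : b.start) + 1;
}

/*
 * For each slot i, swap in whichever later slot overlaps most with the
 * old range of that subvolume, so data movement is minimized without
 * giving up the randomization of phase 1.
 */
static void
maximize_overlap (range_t *old, range_t *new, int cnt)
{
        int      i, k, best;
        uint32_t best_len, len;

        for (i = 0; i < cnt; i++) {
                best     = i;
                best_len = overlap_len (old[i], new[i]);

                for (k = i + 1; k < cnt; k++) {
                        len = overlap_len (old[i], new[k]);
                        if (len > best_len) {
                                best_len = len;
                                best     = k;
                        }
                }

                if (best != i) {
                        /* swap only the hash ranges; the subvolume
                           order stays fixed */
                        range_t tmp = new[i];
                        new[i]      = new[best];
                        new[best]   = tmp;
                }
        }
}

Because only ranges are swapped (never the subvolume slots themselves),
newly added subvolumes keep the random ranges phase 1 gave them, while
existing subvolumes tend to keep a range inside their previous one.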
Diffstat (limited to 'xlators/cluster/dht')
-rw-r--r--  xlators/cluster/dht/src/dht-selfheal.c  223
1 file changed, 78 insertions, 145 deletions
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 8b7d15ce5..8ac5a9e9e 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -28,42 +28,25 @@
layout->list[i].xlator->name, path); \
} while (0)
-static inline uint32_t
-dht_find_overlap (int idx, int cnk_idx, uint32_t start, uint32_t stop,
- uint32_t chunk_size)
+
+static uint32_t
+dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
{
- uint32_t overlap = 0;
- uint32_t chunk_begin = 0;
+ if (o >= old->cnt || n >= new->cnt)
+ return 0;
- chunk_begin = cnk_idx * chunk_size;
+ if (old->list[o].err > 0 || new->list[n].err > 0)
+ return 0;
- /* There is no chance of overlap */
- if ((chunk_begin > stop) ||
- ((chunk_begin + chunk_size) < start))
- goto out;
+ if ((old->list[o].start > new->list[n].stop) ||
+ (old->list[o].stop < new->list[n].start))
+ return 0;
- if ((chunk_begin <= start) &&
- ((chunk_begin + chunk_size) <= stop)) {
- overlap = ((chunk_begin + chunk_size) - start);
- goto out;
- }
-
- if ((chunk_begin <= start) &&
- ((chunk_begin + chunk_size) >= stop)) {
- overlap = (stop - start);
- goto out;
- }
-
- if ((chunk_begin < stop) &&
- ((chunk_begin + chunk_size) >= stop)) {
- overlap = (stop - chunk_begin);
- goto out;
- }
-
-out:
- return overlap;
+ return min (old->list[o].stop, new->list[n].stop) -
+ max (old->list[o].start, new->list[n].start) + 1;
}
+
int
dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
{
@@ -516,7 +499,7 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (j = 0; j < conf->subvolume_cnt; j++) {
if (conf->decommissioned_bricks[j] &&
conf->decommissioned_bricks[j] == layout->list[i].xlator) {
- layout->list[i].err = -EINVAL;
+ layout->list[i].err = EINVAL;
break;
}
}
@@ -548,24 +531,64 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
}
+void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
+
+void dht_layout_entry_swap (dht_layout_t *layout, int i, int j);
+
+void
+dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new, dht_layout_t *old)
+{
+ int i = 0;
+ int k = 0;
+ uint32_t curr_overlap = 0;
+ uint32_t max_overlap = 0;
+ int max_overlap_idx = -1;
+ uint32_t overlap = 0;
+
+
+ dht_layout_sort_volname (old);
+ /* Now both old_layout->list[] and new_layout->list[]
+ refer to the same xlators/subvolumes, i.e.,
+ old_layout->list[i] and new_layout->list[i] describe
+ the same subvolume.
+ */
+
+ for (i = 0; i < new->cnt; i++) {
+ if (new->list[i].err > 0)
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+
+ curr_overlap = dht_overlap_calc (old, i, new, i);
+ max_overlap = curr_overlap;
+ max_overlap_idx = i;
+
+ /* Subvols up to `i' have already found their best
+ matches. Search only among the remaining subvols.
+ */
+ for (k = (i + 1); k < new->cnt; k++) {
+ overlap = dht_overlap_calc (old, i, new, k);
+ if (overlap > max_overlap) {
+ max_overlap_idx = k;
+ max_overlap = overlap;
+ }
+ }
+
+ if (max_overlap > curr_overlap)
+ dht_layout_entry_swap (new, i, max_overlap_idx);
+ }
+}
+
+
dht_layout_t *
dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
{
- uint32_t chunk = 0;
- uint32_t start = 0;
- uint32_t stop = 0;
- uint32_t overlap = 0;
- uint32_t max_overlap = 0;
- uint32_t chunk_begin = 0;
- int count = 0;
- int cnt = 0;
int i = 0;
- int j = 0;
- int k = 0;
- int loop_cnt = 0;
- int start_subvol = 0;
- int *fix_array = NULL;
xlator_t *this = NULL;
dht_layout_t *new_layout = NULL;
dht_conf_t *priv = NULL;
@@ -581,114 +604,26 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
goto done;
}
- count = cnt = dht_get_layout_count (this, layout, 0);
-
- chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1);
-
- start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
-
- fix_array = GF_CALLOC (sizeof (int), layout->cnt, gf_common_mt_char);
- if (!fix_array) {
- /* No fix, use the existing layout itself */
- goto done;
- }
-
new_layout = dht_layout_new (this, priv->subvolume_cnt);
if (!new_layout)
goto done;
for (i = 0; i < new_layout->cnt; i++) {
- /* TODO: fix this in layout_alloc() itself */
- new_layout->list[i].err = -ENOENT;
- if (i < layout->cnt)
- new_layout->list[i].xlator = layout->list[i].xlator;
- }
-
- /* Check if there are any overlap in layout, and give the proper fix */
- for (i = 0; i < layout->cnt; i++) {
- /* No need to fix if 'err' is not '-1' */
- if (layout->list[i].err != -1)
- continue;
-
- /* If already existing layout is having no range, skip it */
- start = layout->list[i].start;
- stop = layout->list[i].stop;
- if ((stop - start) == 0)
- continue;
-
- max_overlap = 0;
-
- /* 'j' is used as starting point of each chunk */
- for (j = 1; j <= count; j++) {
- /* if chunk is already used, don't use it again */
- for (k = 0; k < i; k++)
- if (j == fix_array[k])
- break;
- if (k < i)
- continue;
-
- overlap = dht_find_overlap (i, (j-1), start, stop, chunk);
- if (max_overlap < overlap) {
- max_overlap = overlap;
- fix_array[i] = j;
- }
- }
-
- /* If we have any overlap, then use that itself as new
- layout for the subvolume */
- if (fix_array[i]) {
- chunk_begin = chunk * (fix_array[i] - 1);
- new_layout->list[i].err = -1;
- DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin,
- chunk, cnt, loc->path);
- /* make sure to give (max - 1) as 'stop' range,
- if it is last chunk */
- if (fix_array[i] == count)
- new_layout->list[i].stop = 0xffffffff;
- if (--cnt == 0)
- goto done;
+ if (layout->list[i].err != ENOSPC)
+ new_layout->list[i].err = layout->list[i].err;
+ else
+ new_layout->list[i].err = -1;
- }
+ new_layout->list[i].xlator = layout->list[i].xlator;
}
- /* Now, look for layouts which are not having any overlaps
- and give it a fix */
- for (loop_cnt = 0, i = start_subvol; loop_cnt < new_layout->cnt;
- i++, loop_cnt++) {
- if (i == new_layout->cnt)
- i = 0;
-
- /* If 'fix_array[i]' is set, the layout is already fixed. */
- if (fix_array[i])
- continue;
-
- if (layout->list[i].err != -1) {
- new_layout->list[i].err = layout->list[i].err;
- continue;
- }
-
- for (k = 1; k <= count; k++) {
- for (j = 0; j < new_layout->cnt; j++) {
- if (k == fix_array[j])
- break;
- }
- /* Didn't find any of the list begining with 'k' */
- if (j == new_layout->cnt)
- break;
- }
+ /* First give it a layout as though it is a new directory.
+ This ensures that the rotation kicks in */
+ dht_layout_sort_volname (new_layout);
+ dht_selfheal_layout_new_directory (frame, loc, new_layout);
- fix_array[i] = k;
- chunk_begin = (k - 1) * chunk;
- new_layout->list[i].err = -1;
- DHT_SET_LAYOUT_RANGE (new_layout, i, chunk_begin, chunk, cnt,
- loc->path);
- /* make sure to give (max - 1) as 'stop' range,
- if it is last chunk */
- if (k == count)
- new_layout->list[i].stop = 0xffffffff;
- if (--cnt == 0)
- goto done;
- }
+ /* Now selectively re-assign ranges only when it helps */
+ dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout);
done:
if (new_layout) {
@@ -702,8 +637,6 @@ done:
local->layout = new_layout;
}
- GF_FREE (fix_array);
-
return local->layout;
}