diff options
-rw-r--r-- | tests/basic/distribute/rebal-all-nodes-migrate.t | 143 | ||||
-rw-r--r-- | tests/dht.rc | 24 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 64 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 9 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 8 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 88 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.c | 57 |
8 files changed, 376 insertions, 18 deletions
diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t new file mode 100644 index 00000000000..14f0a53b1f8 --- /dev/null +++ b/tests/basic/distribute/rebal-all-nodes-migrate.t @@ -0,0 +1,143 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../cluster.rc +. $(dirname $0)/../../dht.rc + + +# Check if every single rebalance process migrated some files + +function cluster_rebal_all_nodes_migrated_files { + val=0 + a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}'); +# echo $a + b=($a) + for i in "${b[@]}" + do +# echo "$i"; + if [ "$i" -eq "0" ]; then + echo "false"; + val=1; + fi + done + echo $val +} + +cleanup + +TEST launch_cluster 3; +TEST $CLI_1 peer probe $H2; +TEST $CLI_1 peer probe $H3; +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count + + +#Start with a pure distribute volume (multiple bricks on the same node) +TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4 + +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed +EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + + +############################################################## + +# Next, a dist-rep volume +TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2 
+ +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed +#EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + +############################################################## + +# Next, a disperse volume +TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force + +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed + +# this will not work unless EC is changed to return all node-uuids +# uncomment this once that patch is ready +#EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + +############################################################## + +cleanup diff --git a/tests/dht.rc 
b/tests/dht.rc index bf5e08b645e..53b00645e66 100644 --- a/tests/dht.rc +++ b/tests/dht.rc @@ -66,13 +66,33 @@ function get_hashed_brick() } +function cluster_rebalance_completed() +{ + val=1 + + # Rebalance status will be either "failed" or "completed" + + test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1) + if [ $? -ne 0 ] + then + val=0 + fi + + echo $val + # Do not *return* the value here. If it's non-zero, that will cause + # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to + # a spurious test failure. Nothing else checks the return value anyway + # (they all check the output) so there's no need for it to be non-zero + # just because grep didn't find what we want. +} + function rebalance_completed() { val=1 - test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1) + test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1) if [ $? -eq 0 ] then - val=0 + val=0 fi echo $val diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index af6345ecc2a..8b4fd5cf37b 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3001,6 +3001,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, out: return ret; } + + int dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xattr, @@ -3016,6 +3018,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, char *next_uuid_str = NULL; char *saveptr = NULL; uuid_t node_uuid = {0,}; + char *uuid_list_copy = NULL; + int count = 0; + int i = 0; + int index = 0; + int found = 0; VALIDATE_OR_GOTO (frame, out); @@ -3025,6 +3032,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; conf = this->private; + VALIDATE_OR_GOTO (conf->defrag, out); + + gf_msg_debug (this->name, 0, "subvol %s returned", prev->name); + LOCK 
(&frame->lock); { this_call_cnt = --local->call_cnt; @@ -3048,6 +3059,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* As DHT will not know details of its child xlators + * we need to parse this twice to get the count first + * and allocate memory later. + */ + count = 0; + index = conf->local_subvols_cnt; + + uuid_list_copy = gf_strdup (uuid_list); + for (uuid_str = strtok_r (uuid_list, " ", &saveptr); uuid_str; uuid_str = next_uuid_str) { @@ -3057,24 +3077,57 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR, "Failed to parse uuid" - " failed for %s", prev->name); + " for %s", prev->name); local->op_ret = -1; local->op_errno = EINVAL; goto unlock; } + count++; if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) { gf_msg_debug (this->name, 0, "subvol %s does not" "belong to this node", prev->name); } else { + + /* handle multiple bricks of the same replica + * on the same node */ + if (found) + continue; conf->local_subvols[(conf->local_subvols_cnt)++] - = prev; + = prev; + found = 1; gf_msg_debug (this->name, 0, "subvol %s belongs to" " this node", prev->name); - break; } } + + if (!found) { + local->op_ret = 0; + goto unlock; + } + + conf->local_nodeuuids[index].count = count; + conf->local_nodeuuids[index].uuids + = GF_CALLOC (count, sizeof (uuid_t), 1); + + /* The node-uuids are guaranteed to be returned in the same + * order as the bricks + * A null node-uuid is returned for a brick that is down. 
+ */ + + saveptr = NULL; + i = 0; + + for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr); + uuid_str; + uuid_str = next_uuid_str) { + + next_uuid_str = strtok_r (NULL, " ", &saveptr); + gf_uuid_parse (uuid_str, + conf->local_nodeuuids[index].uuids[i]); + i++; + } } local->op_ret = 0; @@ -3092,8 +3145,13 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; unwind: + + GF_FREE (conf->local_nodeuuids[index].uuids); + conf->local_nodeuuids[index].uuids = NULL; + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata); out: + GF_FREE (uuid_list_copy); return 0; } diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index c8cec133960..f982bf6ac1a 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -419,6 +419,7 @@ struct dht_container { xlator_t *this; loc_t *parent_loc; dict_t *migrate_data; + int local_subvol_index; }; typedef enum tier_mode_ { @@ -490,6 +491,12 @@ typedef struct gf_tier_conf { char volname[GD_VOLUME_NAME_MAX + 1]; } gf_tier_conf_t; +typedef struct subvol_nodeuuids { + uuid_t *uuids; + int count; +} subvol_nodeuuid_t; + + struct gf_defrag_info_ { uint64_t total_files; uint64_t total_data; @@ -540,6 +547,7 @@ struct gf_defrag_info_ { /* lock migration flag */ gf_boolean_t lock_migration_enabled; + }; typedef struct gf_defrag_info_ gf_defrag_info_t; @@ -623,6 +631,7 @@ struct dht_conf { /*local subvol storage for rebalance*/ xlator_t **local_subvols; + subvol_nodeuuid_t *local_nodeuuids; int32_t local_subvols_cnt; /* diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 6f08f557730..38965298325 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -505,7 +505,6 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) goto out; inode = loc->inode; - local->hashed_subvol = dht_subvol_get_hashed (frame->this, loc); 
} if (fd) { @@ -844,7 +843,12 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf) conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *), gf_dht_mt_xlator_t); - if (!conf->local_subvols) { + + /* FIX FIX : do this dynamically*/ + conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t), + gf_dht_nodeuuids_t); + + if (!conf->local_subvols || !conf->local_nodeuuids) { return -1; } diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 3554f3f9c2d..19cccef537b 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -39,6 +39,7 @@ enum gf_dht_mem_types_ { gf_dht_mt_fd_ctx_t, gf_tier_mt_qfile_array_t, gf_dht_ret_cache_t, + gf_dht_nodeuuids_t, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a5d00e37c0e..a1266502d63 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2439,6 +2439,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) { return 0; } + +/* Return value + * 0 : this node does not migrate the file + * 1 : this node migrates the file + */ +int +gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid) +{ + int ret = 0; + int i = local_subvol_index; + char *str = NULL; + uint32_t hashval = 0; + int32_t index = 0; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = {0, }; + + conf = this->private; + + /* Pure distribute */ + + if (conf->local_nodeuuids[i].count == 1) { + return 1; + } + + str = uuid_utoa_r (gfid, buf); + + ret = dht_hash_compute (this, 0, str, &hashval); + if (ret == 0) { + index = (hashval % conf->local_nodeuuids[i].count); + if (!gf_uuid_compare (conf->defrag->node_uuid, + conf->local_nodeuuids[i].uuids[index])) + ret = 1; + } + return ret; +} + + int gf_defrag_migrate_single_file (void *opaque) { @@ -2517,6 +2554,13 @@ 
gf_defrag_migrate_single_file (void *opaque) goto out; } + if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index, + entry->d_stat.ia_gfid)) { + gf_msg_debug (this->name, 0, "Don't migrate %s ", + entry_loc.path); + goto out; + } + gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); gf_uuid_copy (entry_loc.pargfid, loc->gfid); @@ -2531,6 +2575,7 @@ gf_defrag_migrate_single_file (void *opaque) goto out; } + hashed_subvol = dht_subvol_get_hashed (this, &entry_loc); if (!hashed_subvol) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2953,6 +2998,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container, goto out; } + tmp_container->local_subvol_index = i; + tmp_container->df_entry->d_stat = df_entry->d_stat; tmp_container->df_entry->d_ino = df_entry->d_ino; @@ -4032,6 +4079,33 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc) } + +int +dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf, + loc_t *loc) +{ + + dict_t *dict = NULL; + int ret = -1; + + /* Find local subvolumes */ + ret = syncop_getxattr (this, loc, &dict, + GF_REBAL_FIND_LOCAL_SUBVOL, + NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + + ret = 0; +out: + return ret; +} + + int gf_defrag_start_crawl (void *data) { @@ -4056,6 +4130,7 @@ gf_defrag_start_crawl (void *data) gf_boolean_t is_tier_detach = _gf_false; call_frame_t *statfs_frame = NULL; xlator_t *old_THIS = NULL; + int j = 0; this = data; if (!this) @@ -4184,14 +4259,8 @@ gf_defrag_start_crawl (void *data) goto out; } - /* Find local subvolumes */ - ret = syncop_getxattr (this, &loc, &dict, - GF_REBAL_FIND_LOCAL_SUBVOL, - NULL, NULL); + ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc); if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local " - "subvolume determination failed with error: %d", - -ret); ret = -1; goto out; } @@ -4199,6 +4268,11 @@ 
gf_defrag_start_crawl (void *data) for (i = 0 ; i < conf->local_subvols_cnt; i++) { gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols " "are %s", conf->local_subvols[i]->name); + for (j = 0; j < conf->local_nodeuuids[i].count; j++) { + gf_msg (this->name, GF_LOG_INFO, 0, 0, + "node uuids are %s", + uuid_utoa(conf->local_nodeuuids[i].uuids[j])); + } } ret = gf_defrag_total_file_cnt (this, &loc); diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index a8aebe00f69..e4b910eb0e6 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -198,10 +198,17 @@ out: static int tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag) { - int ret = -1; - dict_t *dict = NULL; - char *uuid_str = NULL; - uuid_t node_uuid = {0,}; + int ret = -1; + dict_t *dict = NULL; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + char *dup_str = NULL; + char *str = NULL; + char *save_ptr = NULL; + int count = 0; + uint32_t hashval = 0; + int32_t index = 0; + char buf[GF_UUID_BUF_SIZE] = {0,}; GF_VALIDATE_OR_GOTO ("tier", this, out); GF_VALIDATE_OR_GOTO (this->name, loc, out); @@ -215,15 +222,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag) goto out; } + + /* This returns multiple node-uuids now - one for each brick + * of the subvol. + */ + if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "Failed to get node-uuid for %s", loc->path); goto out; } + dup_str = gf_strdup (uuid_str); + str = dup_str; + + /* How many uuids returned? + * No need to check if one of these is that of the current node. + */ + + count = 1; + while ((str = strchr (str, ' '))) { + count++; + str++; + } + + /* Only one node-uuid - pure distribute? 
*/ + if (count == 1) + goto check_node; + + uuid_utoa_r (loc->gfid, buf); + ret = dht_hash_compute (this, 0, buf, &hashval); + if (ret == 0) { + index = (hashval % count); + } + + count = 0; + str = dup_str; + while ((uuid_str = strtok_r (str, " ", &save_ptr))) { + if (count == index) + break; + count++; + str = NULL; + } + + +check_node: + if (gf_uuid_parse (uuid_str, node_uuid)) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "uuid_parse failed for %s", loc->path); + ret = -1; goto out; } @@ -239,6 +287,7 @@ out: if (dict) dict_unref(dict); + GF_FREE (dup_str); return ret; } |