diff options
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 138 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 11 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 404 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 6 |
4 files changed, 558 insertions, 1 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 65c9c0b0a31..be92236e3bd 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -11465,3 +11465,141 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
     /* Returning the first xlator error as all xlators have errors */
     return layout->list[0].err;
 }
+
+/* Get brick paths from all the local subvols and store for use.
+ *
+ * Fetches the "glusterfs.pathinfo" xattr of @loc from every local subvol,
+ * extracts the brick path from the reply and keeps a copy of it in
+ * defrag->local_brick_paths[]. defrag->is_pure_distribute is turned on only
+ * when neither REPLICATE nor ERASURE shows up in the pathinfo.
+ *
+ * Returns 0 on success and -1 on failure. On failure the stored paths are
+ * released and is_pure_distribute is reset, so the caller operates the old
+ * way.
+ *
+ * TODO: Make sure newly added brick is not picked for migration.
+ * Otherwise there will be no rebalance as directory entries won't be present
+ * on a newly added brick */
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+    dict_t *dict = NULL;
+    gf_defrag_info_t *defrag = conf->defrag;
+    char key[] = "glusterfs.pathinfo";
+    char *str = NULL;
+    char *token = NULL;
+    char *saveptr = NULL;
+    int i = 0;
+    int j = 0;
+    int ret = 0;
+
+    defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt,
+                                          sizeof(*defrag->local_brick_paths),
+                                          gf_common_mt_pointer);
+    if (!defrag->local_brick_paths) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to allocate "
+               "memory");
+        ret = -1;
+        goto out;
+    }
+
+    for (j = 0; j < conf->local_subvols_cnt; j++) {
+        /* syncop calls report failure as -errno, not as a plain -1 */
+        ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL,
+                              NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+                   "failed to get path,"
+                   " errno %d",
+                   -ret);
+            /* TODO: We need not break out from here and can resume operation.
+             * We need a place holder in gf_defrag_info_t to mark which
+             * local_brick_paths we are working on. Right now, we blindly
+             * take defrag->local_brick_path[0]. This can be dynamic based on
+             * need */
+            ret = -1;
+            goto out;
+        }
+
+        str = NULL;
+        ret = dict_get_str(dict, key, &str);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for :%s",
+                   key);
+            ret = -1;
+            goto out;
+        }
+        if (str == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key);
+            ret = -1;
+            goto out;
+        }
+
+        if (!defrag->is_pure_distribute) {
+            /*TODO: fetching glusterfs.pathinfo on erasure volume is failing.
+             *Function the old way till we get it resolved */
+            if (strstr(str, "REPLICATE") || strstr(str, "ERASURE")) {
+                defrag->is_pure_distribute = _gf_false;
+                break;
+            }
+
+            defrag->is_pure_distribute = _gf_true;
+        }
+
+        /* The brick path is the third ':' separated field of the pathinfo,
+         * up to the closing '>' */
+        saveptr = NULL;
+        token = strtok_r(str, ":", &saveptr);
+        for (i = 1; token && i < 3; i++) {
+            token = strtok_r(NULL, ":", &saveptr);
+        }
+        if (token) {
+            token = strtok_r(token, ">", &saveptr);
+        }
+        if (!token) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "failed to parse brick path from pathinfo");
+            ret = -1;
+            goto out;
+        }
+
+        defrag->local_brick_paths[j] = gf_strdup(token);
+        if (!defrag->local_brick_paths[j]) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                   "failed to allocate "
+                   "memory");
+            ret = -1;
+            goto out;
+        }
+
+        /* str and token point into dict, so drop it only now */
+        dict_unref(dict);
+        dict = NULL;
+    }
+
+out:
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "failed to get brick path. "
+               "Will operate old way");
+        if (defrag->local_brick_paths) {
+            for (j = 0; j < conf->local_subvols_cnt; j++) {
+                GF_FREE(defrag->local_brick_paths[j]);
+                /* the crawl cleanup frees whatever is left in here */
+                defrag->local_brick_paths[j] = NULL;
+            }
+        }
+        defrag->is_pure_distribute = _gf_false;
+    }
+
+    if (defrag->is_pure_distribute) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type : pure distribute");
+    }
+
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 028c6ac6b9f..84891406c71 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -598,6 +598,15 @@ struct gf_defrag_info_ {
     gf_boolean_t stats;
     /* lock migration flag */
     gf_boolean_t lock_migration_enabled;
+
+    /* brick paths of the local subvols, used for the local system crawl */
+    char **local_brick_paths;
+
+    /* whether the volume is pure distribute */
+    gf_boolean_t is_pure_distribute;
+
+    /*TODO: Introduce a glusterd option to tune this behaviour*/
+    gf_boolean_t operate_dist;
 };
 
 typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -1482,4 +1491,6 @@ dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
 int
 dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
 
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc);
 #endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
 #include <signal.h>
 #include <glusterfs/events.h>
 #include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <string.h>
 
 #define GF_DISK_SECTOR_SIZE 512
 #define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4052,6 +4053,383 @@ out:
 }
 
+/* Fix the layout of @loc and, depth first, of every directory below it.
+ *
+ * Variant of gf_defrag_fix_layout() for pure distribute volumes: the
+ * sub-directories are found by reading the directory straight from the
+ * first local brick instead of going through the gluster stack.
+ *
+ * Returns 0 on success, 2 when the directory was processed but its hash must
+ * not be committed, 1 when the rebalance is no longer in STARTED state and
+ * another non-zero value on failure.
+ */
+int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+                              loc_t *loc, dict_t *fix_layout,
+                              dict_t *migrate_data)
+{
+    int ret = -1;
+    loc_t entry_loc = {
+        0,
+    };
+    inode_t *linked_inode = NULL, *inode = NULL;
+    dht_conf_t *conf = NULL;
+    int should_commit_hash = 1;
+    int perrno = 0;
+    /* length of a formatted path, as reported by snprintf */
+    int len = 0;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {{
+        0,
+    }};
+    DIR *dirp = NULL;
+    char full_entry_path[4096] = {
+        0,
+    };
+    char full_dir_path[4096] = {
+        0,
+    };
+    ssize_t size = 0;
+    uuid_t tmp_gfid;
+    struct stat tmpbuf = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+
+    struct stat lstatbuf = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+
+    conf = this->private;
+    if (!conf) {
+        ret = -1;
+        goto out;
+    }
+
+    if (!defrag->local_brick_paths || !defrag->local_brick_paths[0]) {
+        gf_log(this->name, GF_LOG_ERROR, "local brick path not available");
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * Since the primary target for the following lookup is to figure out if the
+     * entry still exists, going to do a direct stat call rather than going
+     * through the whole gluster stack. There are some benefits of doing gluster
+     * lookup, but this is redundant since we have done already one gluster
+     * lookup in the parent function.
+     *
+     * Randomly selecting the first local subvol to read, since it is expected
+     * that the directory structure is present in all the subvols identically
+     */
+
+    /* discarding the first "/" of loc->path; snprintf is bounded by the
+     * buffer itself so that a deep tree cannot overflow it */
+    len = snprintf(full_dir_path, sizeof(full_dir_path), "%s%s/",
+                   defrag->local_brick_paths[0], loc->path + 1);
+    if (len < 0 || len >= (int)sizeof(full_dir_path)) {
+        gf_log(this->name, GF_LOG_ERROR, "path too long for directory %s",
+               loc->path);
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_lstat(full_dir_path, &tmpbuf);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "[absolutepath %s] directory "
+               "not found, path %s error %d",
+               full_dir_path, loc->path, errno);
+        goto out;
+    }
+
+    dirp = sys_opendir(full_dir_path);
+    if (!dirp) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+               loc->path);
+        if (conf->decommission_subvols_cnt) {
+            defrag->total_failures++;
+        }
+        goto out;
+    }
+
+    while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = 1;
+            goto out;
+        }
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+            !strcmp(entry->d_name, ".glusterfs"))
+            continue;
+
+        /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+         to stat in case d_type is not defined */
+        if (entry->d_type != DT_DIR) {
+            continue;
+        }
+
+        len = snprintf(full_entry_path, sizeof(full_entry_path), "%s%s/",
+                       full_dir_path, entry->d_name);
+        if (len < 0 || len >= (int)sizeof(full_entry_path)) {
+            gf_log(this->name, GF_LOG_ERROR, "path too long, skipping %s%s",
+                   full_dir_path, entry->d_name);
+            /* a sub-directory is left unfixed, do not commit the hash */
+            should_commit_hash = 0;
+            continue;
+        }
+
+        size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid,
+                             sizeof(tmp_gfid));
+        if (size != (ssize_t)sizeof(tmp_gfid)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+                   full_entry_path);
+            continue;
+        }
+
+        loc_wipe(&entry_loc);
+
+        ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Child loc"
+                   " build failed for entry: %s",
+                   entry->d_name);
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                should_commit_hash = 0;
+
+                continue;
+            }
+        }
+
+        if (gf_uuid_is_null(tmp_gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+        /*In case the gfid stored in the inode by inode_link
+         *and the gfid obtained in the lookup differs, then
+         *client3_3_lookup_cbk will return ESTALE and proper
+         *error will be captured.
+         */
+        memset(&lstatbuf, 0, sizeof(struct stat));
+        ret = sys_lstat(full_entry_path, &lstatbuf);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+                   entry->d_name);
+            /* never link an inode built from an all-zero stat */
+            continue;
+        }
+
+        memset(&stbuf, 0, sizeof(struct iatt));
+        iatt_from_stat(&stbuf, &lstatbuf);
+        gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+        linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+                                  &stbuf);
+
+        inode = entry_loc.inode;
+        entry_loc.inode = linked_inode;
+        inode_unref(inode);
+
+        if (gf_uuid_is_null(loc->gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+        ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+        if (ret) {
+            if (-ret == ENOENT || -ret == ESTALE) {
+                gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                       "Dir:%s renamed or removed. "
+                       "Skipping",
+                       loc->path);
+                ret = 0;
+                if (conf->decommission_subvols_cnt) {
+                    defrag->total_failures++;
+                }
+                continue;
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+                       entry_loc.path);
+
+                defrag->total_failures++;
+
+                if (conf->decommission_in_progress) {
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                    ret = -1;
+                    goto out;
+                } else {
+                    should_commit_hash = 0;
+                    continue;
+                }
+            }
+        }
+
+        /* A return value of 2 means, either process_dir or
+         * lookup of a dir failed. Hence, don't commit hash
+         * for the current directory*/
+
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+                                            fix_layout, migrate_data);
+
+        if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+            goto out;
+        }
+
+        if (ret && ret != 2) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Fix layout failed for %s", entry_loc.path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                /* Let's not commit-hash if
+                 * gf_defrag_fix_layout failed*/
+                continue;
+            }
+        }
+    }
+
+    ret = sys_closedir(dirp);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to close dir %s. Reason :"
+                     " %s",
+                     full_dir_path, strerror(errno));
+        ret = 0;
+    }
+
+    dirp = NULL;
+
+    /* A directory layout is fixed only after its subdirs are healed to
+     * any newly added bricks. If the layout is fixed before subdirs are
+     * healed, the newly added brick will get a non-null layout.
+     * Any subdirs which hash to that layout will no longer show up
+     * in a directory listing until they are healed.
+     */
+
+    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+    /* In case of a race where the directory is deleted just before
+     * layout setxattr, the errors are updated in the layout structure.
+     * We can use this information to make a decision whether the directory
+     * is deleted entirely.
+     */
+    if (ret == 0) {
+        ret = dht_dir_layout_error_check(this, loc->inode);
+        ret = -ret;
+    }
+
+    if (ret) {
+        if (-ret == ENOENT || -ret == ESTALE) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed. Dir %s "
+                   "renamed or removed",
+                   loc->path);
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed for %s", loc->path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+        (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+        ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+        if (ret && (ret != 2)) {
+            if (perrno == ENOENT || perrno == ESTALE) {
+                ret = 0;
+                goto out;
+            } else {
+                defrag->total_failures++;
+
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+                       "gf_defrag_process_dir failed for "
+                       "directory: %s",
+                       loc->path);
+
+                if (conf->decommission_in_progress) {
+                    goto out;
+                }
+
+                should_commit_hash = 0;
+            }
+        } else if (ret == 2) {
+            should_commit_hash = 0;
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+    if (should_commit_hash &&
+        gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+        defrag->total_failures++;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+               "Settle hash failed for %s", loc->path);
+
+        ret = -1;
+
+        if (conf->decommission_in_progress) {
+            defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    loc_wipe(&entry_loc);
+
+    if (ret == 0 && should_commit_hash == 0) {
+        ret = 2;
+    }
+
+    if (dirp) {
+        sys_closedir(dirp);
+    }
+
+    return ret;
+}
+
 int
 dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
                                      loc_t *loc)
 {
@@ -4405,6 +4783,7 @@ gf_defrag_start_crawl(void *data)
     pthread_t *tid = NULL;
     pthread_t filecnt_thread;
     gf_boolean_t fc_thread_started = _gf_false;
+    int i = 0;
 
     this = data;
     if (!this)
@@ -4539,6 +4918,12 @@ gf_defrag_start_crawl(void *data)
             goto out;
         }
 
+        ret = dht_get_brick_paths(this, conf, &loc);
+        if (ret) {
+            gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+            ret = 0;
+        }
+
         /* Initialise the structures required for parallel migration */
         ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
                                                 &thread_index);
@@ -4556,11 +4941,19 @@ gf_defrag_start_crawl(void *data)
         }
     }
 
-    ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+    /* TODO: Need to introduce a flag to safely operate in the old way */
+    if (defrag->operate_dist && defrag->is_pure_distribute) {
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+                                            migrate_data);
+    } else {
+        ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+                                   migrate_data);
+    }
+
     if (ret && ret != 2) {
         defrag->total_failures++;
         ret = -1;
         goto out;
     }
 
     if (ret != 2 &&
@@ -4606,6 +4999,15 @@ out:
     }
     UNLOCK(&defrag->lock);
 
+    /* local_brick_paths may still be NULL here: the crawl can bail out before
+     * dht_get_brick_paths() is called, and its allocation can fail */
+    if (defrag->local_brick_paths) {
+        for (i = 0; i < conf->local_subvols_cnt; i++) {
+            GF_FREE(defrag->local_brick_paths[i]);
+        }
+        GF_FREE(defrag->local_brick_paths);
+    }
+
     GF_FREE(defrag);
     conf->defrag = NULL;
 
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index d85b4d1ce13..811bb55925f 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -700,6 +700,12 @@ dht_init(xlator_t *this)
         pthread_cond_init(&defrag->fc_wakeup_cond, 0);
 
         defrag->global_error = 0;
+
+        defrag->local_brick_paths = NULL;
+
+        defrag->is_pure_distribute = _gf_false;
+
+        defrag->operate_dist = _gf_true;
     }
 
     conf->use_fallocate = 1;