summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src
diff options
context:
space:
mode:
authorSusant Palai <spalai@redhat.com>2020-04-27 16:59:16 +0530
committerSusant Palai <spalai@redhat.com>2020-07-31 16:22:23 +0000
commit3af9443c770837abe4f54db399623380ab9767a7 (patch)
tree943b631cc5210a721b90287cea201b03bf78a1fe /xlators/cluster/dht/src
parentdbff4ecfc18d4d4ad357e2f53806a6caf69d2b65 (diff)
dht: optimize rebalance crawl path
For distribute-only volumes we can use the information from local subvolumes to avoid syncop calls, which go through the whole stack to fetch stat and entries. A separate function gf_defrag_fix_layout_puredist is introduced. TODO: A glusterd flag needs to be introduced in case we want to fall back to running the old way. Perf numbers: DirSize - 1Million Old New %diff Depth - 100 (Run 1) 353 74 +377% Depth - 100 (Run 2) 348 72 +377~% Depth - 50 246 122 +100% Depth - 3 174 114 +52% Change-Id: I67cc136cebd34092fd775e69f74c2d5b33d3156d Fixes: #1242 Signed-off-by: Susant Palai <spalai@redhat.com>
Diffstat (limited to 'xlators/cluster/dht/src')
-rw-r--r--xlators/cluster/dht/src/dht-common.c114
-rw-r--r--xlators/cluster/dht/src/dht-common.h11
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c400
-rw-r--r--xlators/cluster/dht/src/dht-shared.c4
4 files changed, 524 insertions, 5 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 65c9c0b0a31..be92236e3bd 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -11465,3 +11465,117 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
/* Returning the first xlator error as all xlators have errors */
return layout->list[0].err;
}
+
+/* Get brick paths from all the local subvols and store for use.
+ *
+ * TODO: Make sure newly added brick is not picked for migration.
+ * Otherwise there will be no rebalance as directory entries won't be present
+ * on a newly added brick */
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+ dict_t *dict = NULL;
+ gf_defrag_info_t *defrag = conf->defrag;
+ char *key = NULL;
+ char *tmp = NULL;
+ char *str = NULL;
+ char *token;
+ char *saveptr = NULL;
+ int i = 1;
+ int j = 0;
+ int ret = 0;
+
+ key = gf_strdup("glusterfs.pathinfo");
+ if (!key) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "failed to allocate "
+ "memory");
+ ret = -1;
+ goto out;
+ }
+
+ defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt,
+ sizeof(*defrag->local_brick_paths),
+ gf_common_mt_pointer);
+
+ for (j = 0; j < conf->local_subvols_cnt; j++) {
+ ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL,
+ NULL);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "failed to get path,"
+ " errno %d",
+ ret);
+ /* TODO: We need not break out from here and can resume operation.
+ * We need a place holder in gf_defrag_info_t to mark which
+ * local_brick_paths we are working on. Right now, we blindly
+ * take defrag->local_brick_path[0]. This can be dynamic based on
+ * need */
+ goto out;
+ }
+
+ str = NULL;
+ ret = dict_get_str(dict, key, &str);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for :%s",
+ key);
+ goto out;
+ }
+ if (str == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key);
+ ret = -1;
+ goto out;
+ }
+
+ if (!defrag->is_pure_distribute) {
+ tmp = strstr(str, "REPLICATE");
+ if (tmp) {
+ defrag->is_pure_distribute = _gf_false;
+ break;
+ }
+
+ /*TODO: fetching glusterfs.pathinfo on erasure volume is failing.
+ *Function the old way till we get it resolved */
+ tmp = strstr(str, "ERASURE");
+ if (tmp) {
+ defrag->is_pure_distribute = _gf_false;
+ break;
+ }
+
+ defrag->is_pure_distribute = _gf_true;
+ }
+
+ saveptr = NULL;
+
+ for (token = strtok_r(str, ":", &saveptr), i = 1; token;) {
+ token = strtok_r(NULL, ":", &saveptr);
+ i++;
+ if (i == 3) {
+ token = strtok_r(token, ">", &saveptr);
+ break;
+ } else {
+ continue;
+ }
+ }
+
+ defrag->local_brick_paths[j] = gf_strdup(token);
+ }
+
+out:
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "failed to get brick path. "
+ "Will operate old way");
+ for (j = 0; j < conf->local_subvols_cnt; j++) {
+ GF_FREE(defrag->local_brick_paths[j]);
+ }
+ defrag->is_pure_distribute = _gf_false;
+ }
+
+ if (defrag->is_pure_distribute) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type : pure distribute");
+ }
+
+ GF_FREE(key);
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 028c6ac6b9f..84891406c71 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -598,6 +598,15 @@ struct gf_defrag_info_ {
gf_boolean_t stats;
/* lock migration flag */
gf_boolean_t lock_migration_enabled;
+
+ /* local system crawl */
+ char **local_brick_paths;
+
+ /* whether the volume is pure distribute */
+ gf_boolean_t is_pure_distribute;
+
+ /*TODO: Introduce a glusterd option to tune this behaviour*/
+ gf_boolean_t operate_dist;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -1482,4 +1491,6 @@ dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
int
dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc);
#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
#include <signal.h>
#include <glusterfs/events.h>
#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <string.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4052,6 +4053,368 @@ out:
}
/* Fast-path variant of gf_defrag_fix_layout() for pure-distribute volumes.
 *
 * Instead of enumerating directory entries via syncop calls through the
 * whole gluster stack, this crawls the first local brick's backend
 * filesystem directly (sys_opendir/sys_readdir/sys_lstat) to discover
 * subdirectories.  For each directory it: recurses into children first,
 * sets the fix-layout xattr on the directory, optionally migrates data
 * (gf_defrag_process_dir), and finally settles the hash.
 *
 * Parameters:
 *   this         - dht xlator
 *   defrag       - rebalance state; defrag->local_brick_paths[0] must have
 *                  been populated by dht_get_brick_paths()
 *   loc          - directory being processed (loc->path is mount-relative)
 *   fix_layout   - xattr dict used for the layout setxattr
 *   migrate_data - dict handed to gf_defrag_process_dir for file migration
 *
 * Returns 0 on success, 2 when the hash must not be committed for this
 * directory (a child or process_dir failed non-fatally), 1 when the
 * defrag was stopped, -1 on fatal error.
 */
int
gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
                              loc_t *loc, dict_t *fix_layout,
                              dict_t *migrate_data)
{
    int ret = -1;
    loc_t entry_loc = {
        0,
    };
    /* NOTE(review): fd is declared and unref'd at out: but never assigned
     * anywhere in this function — appears to be dead code. */
    fd_t *fd = NULL;
    inode_t *linked_inode = NULL, *inode = NULL;
    dht_conf_t *conf = NULL;
    int should_commit_hash = 1;
    int perrno = 0;
    /* absolute brick path length */
    int brick_len = 0;
    /* dir path length (relative to gluster mount) */
    int dir_len = 0;
    /* absolute dir path length */
    int total_len = 0;
    struct dirent *entry = NULL;
    struct dirent scratch[2] = {{
        0,
    }};
    DIR *dirp = NULL;
    int full_entry_length = 0;
    int entry_len = 0;
    /* NOTE(review): both path buffers are fixed at 4096 bytes, but the
     * snprintf sizes below are computed from string lengths; for a deep
     * enough tree total_len/full_entry_length could exceed 4096 — confirm
     * an upper bound (PATH_MAX) or clamp to sizeof the buffer. */
    char full_entry_path[4096] = {
        0,
    };
    char full_dir_path[4096] = {
        0,
    };
    ssize_t size = 0;
    uuid_t tmp_gfid;
    struct stat tmpbuf = {
        0,
    };
    struct iatt iatt = {
        0,
    };

    struct stat lstatbuf = {
        0,
    };
    struct iatt stbuf = {
        0,
    };

    conf = this->private;
    if (!conf) {
        ret = -1;
        goto out;
    }

    /*
     * Since the primary target for the following lookup is to figure out if the
     * entry still exists, going to do a direct stat call rather than going
     * through the whole gluster stack. There are some benefits of doing gluster
     * lookup, but this is redundant since we have done already one gluster
     * lookup in the parent function.
     *
     * Randomly selecting the first local subvol to read, since it is expected
     * that the directory structure is present in all the subvols identically
     */

    brick_len = strlen(defrag->local_brick_paths[0]);
    /* discarding the first "/" */
    dir_len = strlen(loc->path) - 1;
    /* Extra two: one for "/" at the end and one more for '\0'*/
    total_len = brick_len + dir_len + 2;

    /* Build "<brick-path><mount-relative-path>/", e.g.
     * "/bricks/b0" + "dir1/dir2" -> "/bricks/b0dir1/dir2/"; loc->path
     * starts with '/' so skipping its first byte joins the two parts. */
    snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0],
             loc->path + 1);

    ret = sys_lstat(full_dir_path, &tmpbuf);
    if (ret == -1) {
        gf_log(this->name, GF_LOG_ERROR,
               "[absolutepath %s] directory "
               "not found, path %s error %d",
               full_dir_path, loc->path, errno);
        goto out;
    }

    dirp = sys_opendir(full_dir_path);
    if (!dirp) {
        ret = -1;
        gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
               loc->path);
        if (conf->decommission_subvols_cnt) {
            defrag->total_failures++;
        }
        goto out;
    }

    /* Depth-first crawl: handle every subdirectory before fixing this
     * directory's own layout (see the comment ahead of the setxattr). */
    while ((entry = sys_readdir(dirp, scratch)) != NULL) {
        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
            ret = 1;
            goto out;
        }
        /* Skip ".", ".." and the brick-internal ".glusterfs" tree. */
        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
            !strcmp(entry->d_name, ".glusterfs"))
            continue;

        /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
           to stat in case d_type is not defined */
        if (entry->d_type != DT_DIR) {
            continue;
        }

        entry_len = strlen(entry->d_name);
        full_entry_length = total_len + entry_len + 1; /* one more for "/"*/

        snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path,
                 entry->d_name);

        /* Read the entry's gfid straight from the backend xattr (16 raw
         * bytes) instead of a gluster lookup. */
        size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
        if (size != 16) {
            gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
                   full_entry_path);
            continue;
        }

        loc_wipe(&entry_loc);

        ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
        if (ret) {
            gf_log(this->name, GF_LOG_ERROR,
                   "Child loc"
                   " build failed for entry: %s",
                   entry->d_name);

            if (conf->decommission_in_progress) {
                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;

                goto out;
            } else {
                should_commit_hash = 0;

                continue;
            }
        }

        if (gf_uuid_is_null(tmp_gfid)) {
            gf_log(this->name, GF_LOG_ERROR,
                   "%s/%s"
                   " gfid not present",
                   loc->path, entry->d_name);
            continue;
        }

        gf_uuid_copy(entry_loc.gfid, tmp_gfid);

        /*In case the gfid stored in the inode by inode_link
         *and the gfid obtained in the lookup differs, then
         *client3_3_lookup_cbk will return ESTALE and proper
         *error will be captured.
         */
        memset(&lstatbuf, 0, sizeof(struct stat));
        ret = sys_lstat(full_entry_path, &lstatbuf);
        if (ret == -1) {
            /* NOTE(review): on lstat failure we still proceed with a
             * zeroed stat buffer below — presumably the later
             * syncop_lookup catches a truly missing entry; confirm. */
            gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
                   entry->d_name);
        }

        memset(&stbuf, 0, sizeof(struct iatt));
        iatt_from_stat(&stbuf, &lstatbuf);
        gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
        /* Link the child into the inode table; keep the linked inode and
         * drop our reference to the unlinked one. */
        linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
                                  &stbuf);

        inode = entry_loc.inode;
        entry_loc.inode = linked_inode;
        inode_unref(inode);

        if (gf_uuid_is_null(loc->gfid)) {
            gf_log(this->name, GF_LOG_ERROR,
                   "%s/%s"
                   " gfid not present",
                   loc->path, entry->d_name);
            continue;
        }

        gf_uuid_copy(entry_loc.pargfid, loc->gfid);

        /* One gluster lookup per child to validate the entry still exists
         * (and to trigger ESTALE on gfid mismatch, per the comment above). */
        ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
        if (ret) {
            if (-ret == ENOENT || -ret == ESTALE) {
                gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
                       "Dir:%s renamed or removed. "
                       "Skipping",
                       loc->path);
                ret = 0;
                if (conf->decommission_subvols_cnt) {
                    defrag->total_failures++;
                }
                continue;
            } else {
                gf_msg(this->name, GF_LOG_ERROR, -ret,
                       DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
                       entry_loc.path);

                defrag->total_failures++;

                if (conf->decommission_in_progress) {
                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
                    ret = -1;
                    goto out;
                } else {
                    should_commit_hash = 0;
                    continue;
                }
            }
        }

        /* A return value of 2 means, either process_dir or
         * lookup of a dir failed. Hence, don't commit hash
         * for the current directory*/

        ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
                                            fix_layout, migrate_data);

        if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
            goto out;
        }

        if (ret && ret != 2) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
                   "Fix layout failed for %s", entry_loc.path);

            defrag->total_failures++;

            if (conf->decommission_in_progress) {
                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;

                goto out;
            } else {
                /* Let's not commit-hash if
                 * gf_defrag_fix_layout failed*/
                continue;
            }
        }
    }

    ret = sys_closedir(dirp);
    if (ret) {
        gf_msg_debug(this->name, 0,
                     "Failed to close dir %s. Reason :"
                     " %s",
                     full_dir_path, strerror(errno));
        ret = 0;
    }

    dirp = NULL;

    /* A directory layout is fixed only after its subdirs are healed to
     * any newly added bricks. If the layout is fixed before subdirs are
     * healed, the newly added brick will get a non-null layout.
     * Any subdirs which hash to that layout will no longer show up
     * in a directory listing until they are healed.
     */

    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);

    /* In case of a race where the directory is deleted just before
     * layout setxattr, the errors are updated in the layout structure.
     * We can use this information to make a decision whether the directory
     * is deleted entirely.
     */
    if (ret == 0) {
        ret = dht_dir_layout_error_check(this, loc->inode);
        ret = -ret;
    }

    if (ret) {
        if (-ret == ENOENT || -ret == ESTALE) {
            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
                   "Setxattr failed. Dir %s "
                   "renamed or removed",
                   loc->path);
            if (conf->decommission_subvols_cnt) {
                defrag->total_failures++;
            }
            ret = 0;
            goto out;
        } else {
            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
                   "Setxattr failed for %s", loc->path);

            defrag->total_failures++;

            if (conf->decommission_in_progress) {
                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
                ret = -1;
                goto out;
            }
        }
    }

    /* Data migration is skipped for tier-start and layout-fix-only runs. */
    if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
        (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
        ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);

        if (ret && (ret != 2)) {
            if (perrno == ENOENT || perrno == ESTALE) {
                ret = 0;
                goto out;
            } else {
                defrag->total_failures++;

                gf_msg(this->name, GF_LOG_ERROR, 0,
                       DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
                       "gf_defrag_process_dir failed for "
                       "directory: %s",
                       loc->path);

                if (conf->decommission_in_progress) {
                    goto out;
                }

                should_commit_hash = 0;
            }
        } else if (ret == 2) {
            should_commit_hash = 0;
        }
    }

    gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);

    if (should_commit_hash &&
        gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
        defrag->total_failures++;

        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
               "Settle hash failed for %s", loc->path);

        ret = -1;

        if (conf->decommission_in_progress) {
            defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
            goto out;
        }
    }

    ret = 0;
out:
    loc_wipe(&entry_loc);

    if (fd)
        fd_unref(fd);

    /* Propagate "don't commit hash" to the caller as ret == 2. */
    if (ret == 0 && should_commit_hash == 0) {
        ret = 2;
    }

    if (dirp) {
        sys_closedir(dirp);
    }

    return ret;
}
+
+int
dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
loc_t *loc)
{
@@ -4405,6 +4768,7 @@ gf_defrag_start_crawl(void *data)
pthread_t *tid = NULL;
pthread_t filecnt_thread;
gf_boolean_t fc_thread_started = _gf_false;
+ int i = 0;
this = data;
if (!this)
@@ -4539,6 +4903,12 @@ gf_defrag_start_crawl(void *data)
goto out;
}
+ ret = dht_get_brick_paths(this, conf, &loc);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+ ret = 0;
+ }
+
/* Initialise the structures required for parallel migration */
ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
&thread_index);
@@ -4556,11 +4926,23 @@ gf_defrag_start_crawl(void *data)
}
}
- ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ /* TODO: Need to introduce a flag to safely operate in the old way */
+ if (defrag->operate_dist && defrag->is_pure_distribute) {
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
if (ret != 2 &&
@@ -4606,6 +4988,14 @@ out:
}
UNLOCK(&defrag->lock);
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ if (defrag->local_brick_paths[i]) {
+ GF_FREE(defrag->local_brick_paths[i]);
+ }
+ }
+
+ GF_FREE(defrag->local_brick_paths);
+
GF_FREE(defrag);
conf->defrag = NULL;
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index d85b4d1ce13..811bb55925f 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -700,6 +700,10 @@ dht_init(xlator_t *this)
pthread_cond_init(&defrag->fc_wakeup_cond, 0);
defrag->global_error = 0;
+
+ defrag->is_pure_distribute = _gf_false;
+
+ defrag->operate_dist = _gf_true;
}
conf->use_fallocate = 1;