/*
  Copyright (c) 2008-2012 Red Hat, Inc.
  This file is part of GlusterFS.

  This file is licensed to you under your choice of the GNU Lesser
  General Public License, version 3 or any later version (LGPLv3 or
  later), or the GNU General Public License, version 2 (GPLv2), in all
  cases as published by the Free Software Foundation.
*/

/* TODO: add NS locking */

#include <glusterfs/statedump.h>
#include "dht-common.h"
#include "dht-messages.h"

#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

/* TODO:
   - use volumename in xattr instead of "dht"
   - use NS locks
   - handle all cases in self heal layout reconstruction
   - complete linkfile selfheal
*/

static void
dht_layout_dump(dht_layout_t *layout, const char *prefix)
{
    char key[GF_DUMP_MAX_BUF_LEN];
    int i = 0;

    if (!layout)
        goto out;

    gf_proc_dump_build_key(key, prefix, "cnt");
    gf_proc_dump_write(key, "%d", layout->cnt);
    gf_proc_dump_build_key(key, prefix, "preset");
    gf_proc_dump_write(key, "%d", layout->preset);
    gf_proc_dump_build_key(key, prefix, "gen");
    gf_proc_dump_write(key, "%d", layout->gen);
    if (layout->type != IA_INVAL) {
        gf_proc_dump_build_key(key, prefix, "inode type");
        gf_proc_dump_write(key, "%d", layout->type);
    }

    if (!IA_ISDIR(layout->type))
        goto out;

    for (i = 0; i < layout->cnt; i++) {
        gf_proc_dump_build_key(key, prefix, "list[%d].err", i);
        gf_proc_dump_write(key, "%d", layout->list[i].err);
        gf_proc_dump_build_key(key, prefix, "list[%d].start", i);
        gf_proc_dump_write(key, "0x%x", layout->list[i].start);
        gf_proc_dump_build_key(key, prefix, "list[%d].stop", i);
        gf_proc_dump_write(key, "0x%x", layout->list[i].stop);
        if (layout->list[i].xlator) {
            gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i);
            gf_proc_dump_write(key, "%s", layout->list[i].xlator->type);
            gf_proc_dump_build_key(key, prefix, "list[%d].xlator.name", i);
            gf_proc_dump_write(key, "%s", layout->list[i].xlator->name);
        }
    }
out:
    return;
}

int32_t
dht_priv_dump(xlator_t *this)
{
    char key_prefix[GF_DUMP_MAX_BUF_LEN];
    char key[GF_DUMP_MAX_BUF_LEN];
    int i = 0;
    dht_conf_t *conf = NULL;
    int ret = -1;

    if (!this)
        goto out;

    conf = this->private;
    if (!conf)
        goto out;

    ret = TRY_LOCK(&conf->subvolume_lock);
    if (ret != 0) {
        return ret;
    }

    gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
    gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv",
                           this->name);
    gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt);
    for (i = 0; i < conf->subvolume_cnt; i++) {
        snprintf(key, sizeof(key), "subvolumes[%d]", i);
        gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
                           conf->subvolumes[i]->name);
        if (conf->file_layouts && conf->file_layouts[i]) {
            snprintf(key, sizeof(key), "file_layouts[%d]", i);
            dht_layout_dump(conf->file_layouts[i], key);
        }
        if (conf->dir_layouts && conf->dir_layouts[i]) {
            snprintf(key, sizeof(key), "dir_layouts[%d]", i);
            dht_layout_dump(conf->dir_layouts[i], key);
        }
        if (conf->subvolume_status) {
            snprintf(key, sizeof(key), "subvolume_status[%d]", i);
            gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[i]);
        }
    }

    gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
    gf_proc_dump_write("gen", "%d", conf->gen);
    gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
    gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
    gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
    gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
    gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
    gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp);

    if (conf->du_stats &&
        conf->subvolume_status) {
        for (i = 0; i < conf->subvolume_cnt; i++) {
            if (!conf->subvolume_status[i])
                continue;
            snprintf(key, sizeof(key), "subvolumes[%d]", i);
            gf_proc_dump_write(key, "%s", conf->subvolumes[i]->name);
            snprintf(key, sizeof(key), "du_stats[%d].avail_percent", i);
            gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent);
            snprintf(key, sizeof(key), "du_stats[%d].avail_space", i);
            gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space);
            snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i);
            gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes);
            snprintf(key, sizeof(key), "du_stats[%d].log", i);
            gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[i].log);
        }
    }

    if (conf->last_stat_fetch)
        gf_proc_dump_write("last_stat_fetch", "%s",
                           ctime(&conf->last_stat_fetch));

    UNLOCK(&conf->subvolume_lock);

out:
    return ret;
}

int32_t
dht_inodectx_dump(xlator_t *this, inode_t *inode)
{
    int ret = -1;
    dht_layout_t *layout = NULL;

    if (!this)
        goto out;
    if (!inode)
        goto out;

    ret = dht_inode_ctx_layout_get(inode, this, &layout);
    if ((ret != 0) || !layout)
        return ret;

    gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
    dht_layout_dump(layout, "layout");

out:
    return ret;
}

void
dht_fini(xlator_t *this)
{
    int i = 0;
    dht_conf_t *conf = NULL;

    GF_VALIDATE_OR_GOTO("dht", this, out);

    conf = this->private;
    this->private = NULL;
    if (conf) {
        if (conf->file_layouts) {
            for (i = 0; i < conf->subvolume_cnt; i++) {
                GF_FREE(conf->file_layouts[i]);
            }
            GF_FREE(conf->file_layouts);
        }

        dict_unref(conf->leaf_to_subvol);

        /* allocated in dht_init_subvolumes() */
        GF_FREE(conf->subvolumes);
        GF_FREE(conf->subvolume_status);
        GF_FREE(conf->last_event);
        GF_FREE(conf->subvol_up_time);
        GF_FREE(conf->du_stats);
        GF_FREE(conf->decommissioned_bricks);

        /* allocated in dht_init() */
        GF_FREE(conf->mds_xattr_key);
        GF_FREE(conf->link_xattr_name);
        GF_FREE(conf->commithash_xattr_name);
        GF_FREE(conf->wild_xattr_name);

        /* allocated in dht_init_regex() */
        if (conf->rsync_regex_valid)
            regfree(&conf->rsync_regex);
        if (conf->extra_regex_valid)
            regfree(&conf->extra_regex);

        synclock_destroy(&conf->link_lock);

        if (conf->lock_pool)
            mem_pool_destroy(conf->lock_pool);

        GF_FREE(conf);
    }
out:
    return;
}

int32_t
mem_acct_init(xlator_t *this)
{
    int ret = -1;

    GF_VALIDATE_OR_GOTO("dht", this, out);

    ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1);

    if (ret != 0) {
        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
               "Memory accounting init failed");
        return ret;
    }
out:
    return ret;
}

static int
dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
                                const char *bricks)
{
    int i = 0;
    int ret = -1;
    char *tmpstr = NULL;
    char *dup_brick = NULL;
    char *node = NULL;

    if (!conf || !bricks)
        goto out;

    dup_brick = gf_strdup(bricks);
    if (dup_brick == NULL) {
        goto out;
    }

    node = strtok_r(dup_brick, ",", &tmpstr);
    while (node) {
        for (i = 0; i < conf->subvolume_cnt; i++) {
            if (!strcmp(conf->subvolumes[i]->name, node)) {
                conf->decommissioned_bricks[i] = conf->subvolumes[i];
                conf->decommission_subvols_cnt++;
                gf_msg(this->name, GF_LOG_INFO, 0,
                       DHT_MSG_SUBVOL_DECOMMISSION_INFO,
                       "decommissioning subvolume %s",
                       conf->subvolumes[i]->name);
                break;
            }
        }
        if (i == conf->subvolume_cnt) {
            /* Wrong node given. */
            goto out;
        }
        node = strtok_r(NULL, ",", &tmpstr);
    }

    ret = 0;
    conf->decommission_in_progress = 1;
out:
    GF_FREE(dup_brick);

    return ret;
}

static void
dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
{
    int i = 0;

    for (i = 0; i < conf->subvolume_cnt; i++) {
        if (conf->decommissioned_bricks[i]) {
            conf->decommissioned_bricks[i] = NULL;
            conf->decommission_subvols_cnt--;
        }
    }
}

static void
dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re,
               gf_boolean_t *re_valid, dht_conf_t *conf)
{
    char *temp_str = NULL;

    if (dict_get_str(odict, name, &temp_str) != 0) {
        if (strcmp(name, "rsync-hash-regex")) {
            return;
        }
        temp_str = "^\\.(.+)\\.[^.]+$";
    }

    LOCK(&conf->lock);
    {
        if (*re_valid) {
            regfree(re);
            *re_valid = _gf_false;
        }

        if (!strcmp(temp_str, "none")) {
            goto unlock;
        }

        if (regcomp(re, temp_str, REG_EXTENDED) == 0) {
            gf_msg_debug(this->name, 0, "using regex %s = %s", name, temp_str);
            *re_valid = _gf_true;
        } else {
            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO,
                   "compiling regex %s failed", temp_str);
        }
    }
unlock:
    UNLOCK(&conf->lock);
}

int
dht_set_subvol_range(xlator_t *this)
{
    int ret = -1;
    dht_conf_t *conf = NULL;

    conf = this->private;

    if (!conf)
        goto out;

    conf->leaf_to_subvol = dict_new();
    if (!conf->leaf_to_subvol)
        goto out;

    ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol);

out:
    return ret;
}

static int
dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
{
    int rebal_thread_count = 0;
    int ret = 0;

    pthread_mutex_lock(&conf->defrag->dfq_mutex);
    {
        if (!strcasecmp(temp_str, "lazy")) {
            conf->defrag->recon_thread_count = 1;
        } else if (!strcasecmp(temp_str, "normal")) {
            conf->defrag->recon_thread_count = 2;
        } else if (!strcasecmp(temp_str, "aggressive")) {
            conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4);
        } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) {
            if ((rebal_thread_count > 0) &&
                (rebal_thread_count <= MAX_REBAL_THREADS)) {
                conf->defrag->recon_thread_count = rebal_thread_count;
                pthread_mutex_unlock(&conf->defrag->dfq_mutex);
                gf_msg(this->name, GF_LOG_INFO, 0, 0,
                       "rebal thread count configured to %d",
                       rebal_thread_count);
                goto out;
            } else {
                pthread_mutex_unlock(&conf->defrag->dfq_mutex);
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option: Reconfigure: "
                       "rebal-throttle should be "
                       "within range of 0 and maximum number of"
                       " cores available");
                ret = -1;
                goto out;
            }
        } else {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option: Reconfigure: "
                   "rebal-throttle should be {lazy|normal|aggressive}"
                   " or a number up to the number of cores available,"
                   " not (%s), defaulting to (%d)",
                   temp_str, conf->dthrottle);
            ret = -1;
        }
    }
    pthread_mutex_unlock(&conf->defrag->dfq_mutex);
out:
    return ret;
}

int
dht_reconfigure(xlator_t *this, dict_t *options)
{
    dht_conf_t *conf = NULL;
    char *temp_str = NULL;
    gf_boolean_t search_unhashed;
    int ret = -1;

    GF_VALIDATE_OR_GOTO("dht", this, out);
    GF_VALIDATE_OR_GOTO("dht", options, out);

    conf = this->private;
    if (!conf)
        return 0;

    if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) {
        /* If option is not "auto", other options _should_ be boolean */
        if (strcasecmp(temp_str, "auto")) {
            if (!gf_string2boolean(temp_str, &search_unhashed)) {
                gf_msg_debug(this->name, 0,
                             "Reconfigure: "
                             "lookup-unhashed reconfigured(%s)",
                             temp_str);
                conf->search_unhashed = search_unhashed;
            } else {
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option: Reconfigure: "
                       "lookup-unhashed should be boolean,"
                       " not (%s), defaulting to (%d)",
                       temp_str, conf->search_unhashed);
                ret = -1;
                goto out;
            }
        } else {
            gf_msg_debug(this->name, 0,
                         "Reconfigure:"
                         " lookup-unhashed reconfigured auto ");
            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
        }
    }

    GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool,
                     out);

    GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options,
                     percent_or_size, out);
    /* option can be any one of percent or bytes */
    conf->disk_unit = 0;
    if (conf->min_free_disk < 100.0)
        conf->disk_unit = 'p';

    GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options,
                     percent, out);

    GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options,
                     uint32, out);

    GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool,
                     out);

    GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
                     options, bool, out);

    GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options,
                     bool, out);

    GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool,
                     out);

    if (conf->defrag) {
        if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) {
            ret = dht_configure_throttle(this, conf, temp_str);
            if (ret == -1)
                goto out;
        }
    }

    if (conf->defrag) {
        conf->defrag->lock_migration_enabled = conf->lock_migration_enabled;
    }

    if (conf->defrag) {
        GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool,
                         out);
    }

    if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) {
        ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
        if (ret == -1)
            goto out;
    } else {
        dht_decommissioned_remove(this, conf);
    }

    dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex,
                   &conf->rsync_regex_valid, conf);
    dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex,
                   &conf->extra_regex_valid, conf);

    GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool,
                     out);

    GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out);

    ret = 0;
out:
    return ret;
}

static int
gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag,
                            char *data)
{
    int ret = -1;
    char *tmp_str = NULL;
    char *tmp_str1 = NULL;
    char *dup_str = NULL;
    char *num = NULL;
    char *pattern_str = NULL;
    char *pattern = NULL;
    gf_defrag_pattern_list_t *temp_list = NULL;
    gf_defrag_pattern_list_t *pattern_list = NULL;

    if (!this || !defrag || !data)
        goto out;

    /* Get the entries for the pattern list: a pattern optionally followed
     * by ":size", eg: *avi, *pdf:10MB, *:1TB
     */
    pattern_str = strtok_r(data, ",", &tmp_str);
    while (pattern_str) {
        dup_str = gf_strdup(pattern_str);
        if (!dup_str)
            goto out;
        pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1);
        if (!pattern_list) {
            goto out;
        }
        pattern = strtok_r(dup_str, ":", &tmp_str1);
        num = strtok_r(NULL, ":", &tmp_str1);
        if (!pattern)
            goto out;
        if (!num) {
            if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) {
                pattern = "*";
            }
        } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option. Defrag pattern:"
                   " Invalid number format \"%s\"",
                   num);
            goto out;
        }
        /* Copy only the pattern itself, bounded by the destination size;
         * "pattern" may have been reset to the short literal "*" above, so
         * copying strlen(dup_str) bytes could read past its end. */
        snprintf(pattern_list->path_pattern,
                 sizeof(pattern_list->path_pattern), "%s", pattern);

        if (!defrag->defrag_pattern)
            temp_list = NULL;
        else
            temp_list = defrag->defrag_pattern;

        pattern_list->next = temp_list;

        defrag->defrag_pattern = pattern_list;
        pattern_list = NULL;

        GF_FREE(dup_str);
        dup_str = NULL;

        pattern_str = strtok_r(NULL, ",", &tmp_str);
    }

    ret = 0;
out:
    if (ret)
        GF_FREE(pattern_list);
    GF_FREE(dup_str);

    return ret;
}

static int
dht_init_methods(xlator_t *this)
{
    int ret = -1;
    dht_conf_t *conf = NULL;
    dht_methods_t *methods = NULL;

    GF_VALIDATE_OR_GOTO("dht", this, err);

    conf = this->private;
    methods = &(conf->methods);

    methods->migration_get_dst_subvol = dht_migration_get_dst_subvol;
    methods->migration_other = NULL;
    methods->layout_search = dht_layout_search;

    ret = 0;
err:
    return ret;
}

int
dht_init(xlator_t *this)
{
    dht_conf_t *conf = NULL;
    char *temp_str = NULL;
    int ret = -1;
    int i = 0;
    gf_defrag_info_t *defrag = NULL;
    int cmd = 0;
    char *node_uuid = NULL;
    uint32_t commit_hash = 0;

    GF_VALIDATE_OR_GOTO("dht", this, err);

    if (!this->children) {
        gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION,
               "Distribute needs more than one subvolume");
        return -1;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION,
               "dangling volume. check volfile");
    }

    conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t);
    if (!conf) {
        goto err;
    }

    LOCK_INIT(&conf->subvolume_lock);
    LOCK_INIT(&conf->layout_lock);
    LOCK_INIT(&conf->lock);
    synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT);

    /* We get the commit-hash to set only for rebalance process */
    if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) {
        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO,
               "%s using commit hash %u", __func__, commit_hash);
        conf->vol_commit_hash = commit_hash;
        conf->vch_forced = _gf_true;
    }

    ret = dict_get_int32(this->options, "rebalance-cmd", &cmd);

    if (cmd) {
        defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt);

        GF_VALIDATE_OR_GOTO(this->name, defrag, err);

        LOCK_INIT(&defrag->lock);

        defrag->is_exiting = 0;

        conf->defrag = defrag;
        defrag->this = this;

        ret = dict_get_str(this->options, "node-uuid", &node_uuid);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION,
                   "Invalid volume configuration: "
                   "node-uuid not specified");
            goto err;
        }

        if (gf_uuid_parse(node_uuid, defrag->node_uuid)) {
            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                   "Invalid option:"
                   " Cannot parse glusterd node uuid");
            goto err;
        }

        defrag->cmd = cmd;
        defrag->stats = _gf_false;
        defrag->queue = NULL;
        defrag->crawl_done = 0;
        defrag->global_error = 0;
        defrag->q_entry_count = 0;
        defrag->wakeup_crawler = 0;

        pthread_mutex_init(&defrag->dfq_mutex, 0);
        pthread_cond_init(&defrag->parallel_migration_cond, 0);
        pthread_cond_init(&defrag->rebalance_crawler_alarm, 0);
        pthread_cond_init(&defrag->df_wakeup_thread, 0);

        pthread_mutex_init(&defrag->fc_mutex, 0);
        pthread_cond_init(&defrag->fc_wakeup_cond, 0);

        defrag->global_error = 0;
    }

    conf->use_fallocate = 1;

    conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
    if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) {
        /* If option is not "auto", other options _should_ be boolean */
        if (strcasecmp(temp_str, "auto")) {
            gf_boolean_t search_unhashed_bool;
            ret = gf_string2boolean(temp_str, &search_unhashed_bool);
            if (ret == -1) {
                goto err;
            }
            conf->search_unhashed = search_unhashed_bool
                                        ? GF_DHT_LOOKUP_UNHASHED_ON
                                        : GF_DHT_LOOKUP_UNHASHED_OFF;
        } else {
            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
        }
    }

    GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err);

    GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
                   err);

    GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err);

    GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err);

    GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err);

    conf->dir_spread_cnt = conf->subvolume_cnt;
    GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32,
                   err);

    GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool,
                   err);

    GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err);

    GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err);

    GF_OPTION_INIT("force-migration", conf->force_migration, bool, err);

    if (defrag) {
        defrag->lock_migration_enabled = conf->lock_migration_enabled;

        GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err);
        if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) {
            if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) {
                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
                       "Invalid option:"
                       " Cannot parse rebalance-filter (%s)",
                       temp_str);

                goto err;
            }
        }
    }

    /* option can be any one of percent or bytes */
    conf->disk_unit = 0;
    if (conf->min_free_disk < 100)
        conf->disk_unit = 'p';

    ret = dht_init_subvolumes(this, conf);
    if (ret == -1) {
        goto err;
    }

    if (cmd) {
        ret = dht_init_local_subvolumes(this, conf);
        if (ret) {
            gf_msg(this->name, GF_LOG_ERROR, 0,
                   DHT_MSG_INIT_LOCAL_SUBVOL_FAILED,
                   "dht_init_local_subvolumes failed");
            goto err;
        }
    }

    if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) {
        ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
        if (ret == -1)
            goto err;
    }

    dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex,
                   &conf->rsync_regex_valid, conf);
    dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex,
                   &conf->extra_regex_valid, conf);

    ret = dht_layouts_init(this, conf);
    if (ret == -1) {
        goto err;
    }

    conf->gen = 1;

    this->local_pool = mem_pool_new(dht_local_t, 512);
    if (!this->local_pool) {
        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
               " DHT initialisation failed. "
               "failed to create local_t's memory pool");
        goto err;
    }

    GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
                   bool, err);

    if (defrag) {
        GF_OPTION_INIT("rebal-throttle", temp_str, str, err);
        if (temp_str) {
            ret = dht_configure_throttle(this, conf, temp_str);
            if (ret == -1)
                goto err;
        }
    }

    GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err);
    gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name);
    gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR,
                conf->xattr_name);
    gf_asprintf(&conf->commithash_xattr_name,
                "%s." DHT_COMMITHASH_STR, conf->xattr_name);
    gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name);
    if (!conf->link_xattr_name || !conf->wild_xattr_name) {
        goto err;
    }

    GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err);

    conf->lock_pool = mem_pool_new(dht_lock_t, 512);
    if (!conf->lock_pool) {
        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED,
               "failed to create lock mem_pool, failing "
               "initialization");
        goto err;
    }

    this->private = conf;

    if (dht_set_subvol_range(this))
        goto err;

    if (dht_init_methods(this))
        goto err;

    return 0;

err:
    if (conf) {
        if (conf->file_layouts) {
            for (i = 0; i < conf->subvolume_cnt; i++) {
                GF_FREE(conf->file_layouts[i]);
            }
            GF_FREE(conf->file_layouts);
        }

        GF_FREE(conf->subvolumes);
        GF_FREE(conf->subvolume_status);
        GF_FREE(conf->du_stats);
        GF_FREE(conf->defrag);
        GF_FREE(conf->xattr_name);
        GF_FREE(conf->link_xattr_name);
        GF_FREE(conf->wild_xattr_name);
        GF_FREE(conf->mds_xattr_key);

        if (conf->lock_pool)
            mem_pool_destroy(conf->lock_pool);

        GF_FREE(conf);
    }

    return -1;
}

struct volume_options dht_options[] = {
    {
        .key = {"lookup-unhashed"},
        .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on",
                  "off"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = "on",
        .description = "This option if set to ON, does a lookup through "
                       "all the sub-volumes, in case a lookup didn't return "
                       "any result from the hash subvolume. If set to OFF, "
                       "it does not do a lookup on the remaining subvolumes.",
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
        .level = OPT_STATUS_BASIC,
    },
    {
        .key = {"lookup-optimize"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "on",
        .description = "This option if set to ON enables the optimization "
                       "of -ve lookups, by not doing a lookup on non-hashed "
                       "subvolumes for files, in case the hashed subvolume "
                       "does not return any result. This option disregards "
                       "the lookup-unhashed setting, when enabled.",
        .op_version = {GD_OP_VERSION_3_7_2},
        .level = OPT_STATUS_ADVANCED,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"min-free-disk"},
        .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
        .default_value = "10%",
        .description = "Percentage/Size of disk space, after which the "
                       "process starts balancing out the cluster, and logs "
                       "will appear in log files",
        .op_version = {1},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"min-free-inodes"},
        .type = GF_OPTION_TYPE_PERCENT,
        .default_value = "5%",
        .description = "after system has only N% of inodes, warnings "
                       "start to appear in log files",
        .op_version = {1},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"unhashed-sticky-bit"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
    },
    {
        .key = {"use-readdirp"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "on",
        .description = "This option if set to ON, forces the use of "
                       "readdirp, and hence also displays the stats of the "
                       "files.",
        .level = OPT_STATUS_ADVANCED,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"assert-no-child-down"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "This option if set to ON, in the event of "
                       "CHILD_DOWN, will call exit.",
    },
    {
        .key = {"directory-layout-spread"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .validate = GF_OPT_VALIDATE_MIN,
        .description = "Specifies the directory layout spread. "
                       "Takes number of subvolumes as default value.",
        .op_version = {2},
    },
    {
        .key = {"decommissioned-bricks"},
        .type = GF_OPTION_TYPE_ANY,
        .description = "This option if set to ON, decommissions "
                       "the brick, so that no new data is allowed to be "
                       "created on that brick.",
        .level = OPT_STATUS_ADVANCED,
    },
    {
        .key = {"rebalance-cmd"},
        .type = GF_OPTION_TYPE_INT,
    },
    {
        .key = {"commit-hash"},
        .type = GF_OPTION_TYPE_INT,
    },
    {
        .key = {"node-uuid"},
        .type = GF_OPTION_TYPE_STR,
    },
    {
        .key = {"rebalance-stats"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "This option if set to ON displays and logs the "
                       "time taken for migration of each file, during the "
                       "rebalance process. If set to OFF, the rebalance logs "
                       "will only display the time spent in each directory.",
        .op_version = {2},
        .level = OPT_STATUS_BASIC,
    },
    {
        .key = {"readdir-optimize"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "This option if set to ON enables the optimization "
                       "that allows DHT to request non-first subvolumes to "
                       "filter out directory entries.",
        .op_version = {1},
        .level = OPT_STATUS_ADVANCED,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"rsync-hash-regex"},
        .type = GF_OPTION_TYPE_STR,
        /* Setting a default here doesn't work. See dht_init_regex. */
        .description = "Regular expression for stripping temporary-file "
                       "suffix and prefix used by rsync, to prevent "
                       "relocation when the file is renamed.",
        .op_version = {3},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"extra-hash-regex"},
        .type = GF_OPTION_TYPE_STR,
        /* Setting a default here doesn't work. See dht_init_regex. */
        .description = "Regular expression for stripping temporary-file "
                       "suffix and prefix used by an application, to prevent "
                       "relocation when the file is renamed.",
        .op_version = {3},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"rebalance-filter"},
        .type = GF_OPTION_TYPE_STR,
    },
    {
        .key = {"xattr-name"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = "trusted.glusterfs.dht",
        .description = "Base for extended attributes used by this "
                       "translator instance, to avoid conflicts with others "
                       "above or below it.",
        .op_version = {3},
    },
    {
        .key = {"weighted-rebalance"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "on",
        .description = "When enabled, files will be allocated to bricks "
                       "with a probability proportional to their size. "
                       "Otherwise, all bricks will have the same probability "
                       "(legacy behavior).",
        .op_version = {GD_OP_VERSION_3_6_0},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },

    /* NUFA option */
    {
        .key = {"local-volume-name"},
        .type = GF_OPTION_TYPE_XLATOR,
    },

    /* switch option */
    {
        .key = {"pattern.switch.case"},
        .type = GF_OPTION_TYPE_ANY,
    },

    {
        .key = {"randomize-hash-range-by-gfid"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "Use gfid of directory to determine the subvolume "
                       "from which hash ranges are allocated starting with 0. "
                       "Note that we still use a directory/file's name to "
                       "determine the subvolume to which it hashes",
        .op_version = {GD_OP_VERSION_3_6_0},
    },
    {
        .key = {"rebal-throttle"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = "normal",
        .description = "Sets the maximum number of parallel file migrations "
                       "allowed on a node during the rebalance operation. The"
                       " default value is normal and allows a max of "
                       "[($(processing units) - 4) / 2), 2] files to be "
                       "migrated at a time. "
                       "Lazy will allow only one file to "
                       "be migrated at a time and aggressive will allow "
                       "max of [($(processing units) - 4) / 2), 4]",
        .op_version = {GD_OP_VERSION_3_7_0},
        .level = OPT_STATUS_BASIC,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"lock-migration"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "If enabled this feature will migrate the posix locks"
                       " associated with a file during rebalance",
        .op_version = {GD_OP_VERSION_3_8_0},
        .level = OPT_STATUS_ADVANCED,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {
        .key = {"force-migration"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
        .description = "If disabled, rebalance will not migrate files that "
                       "are being written to by an application",
        .op_version = {GD_OP_VERSION_4_0_0},
        .level = OPT_STATUS_ADVANCED,
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
    },
    {.key = {NULL}},
};

#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0]))

extern struct volume_options options[NUM_DHT_OPTIONS]
    __attribute__((alias("dht_options")));