From 896fc241850aaa021f6f8958da4e37e37679c0cd Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 15 Nov 2011 13:44:43 -0800 Subject: cluster/distribute: Add support for 'min-free-inodes' on each distribute subvolume. This change is required because an increasingly large number of small files can cause a subvolume to run out of inodes before it runs out of available disk space. It is therefore necessary to check free inodes algorithmically, just as we already do for disk space. Change-Id: I9b87405328d443825e239ee80ab664aceb50ee68 BUG: 3799 Signed-off-by: Harshavardhana Reviewed-on: http://review.gluster.com/730 Tested-by: Gluster Build System Reviewed-by: Jeff Darcy Reviewed-by: Amar Tumballi --- xlators/cluster/dht/src/dht-common.c | 2 +- xlators/cluster/dht/src/dht-common.h | 4 +- xlators/cluster/dht/src/dht-diskusage.c | 432 ++++++++++++++++++-------------- xlators/cluster/dht/src/dht.c | 19 +- 4 files changed, 257 insertions(+), 200 deletions(-) (limited to 'xlators/cluster/dht/src') diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 7508684aaf2..fb149e7635e 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3203,7 +3203,7 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; int this_call_cnt = 0; int ret = -1; - int subvol_filled = 0; + gf_boolean_t subvol_filled = _gf_false; call_frame_t *prev = NULL; dht_layout_t *layout = NULL; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 8af6dbdcdfd..54cef6cd9d4 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -169,6 +169,7 @@ typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; + double avail_inodes; uint64_t avail_space; uint32_t log; }; @@ -186,6 +187,7 @@ struct dht_conf { int gen; dht_du_t *du_stats; uint64_t min_free_disk; + uint32_t min_free_inodes; char 
disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -355,7 +357,7 @@ int dht_rename (call_frame_t *frame, xlator_t *this, int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 0b8c116ca40..5453e3b107b 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -35,227 +35,269 @@ int dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs) + int op_ret, int op_errno, struct statvfs *statvfs) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bavail * statvfs->f_frsize); - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - gf_log (this->name, GF_LOG_TRACE, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64"", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + 
uint64_t bytes = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "failed to get disk info from %s", prev->this->name); + goto out; + } + + if (statvfs && statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + } + + if (statvfs && statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* Set percent_inodes to 100 for filesystems that allocate inodes + dynamically (those reporting f_files == 0). Distribute then has no + inode limit to worry about and lets 'create()' be scheduled on the hashed + subvol regardless of the total inode count; since we have no way of + knowing that such a filesystem is losing inodes, this logic fits well. + */ + percent_inodes = 100; + } + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev->this == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + gf_log (this->name, GF_LOG_DEBUG, + "on subvolume '%s': avail_percent is: " + "%.2f and avail_space is: %"PRIu64" " + "and avail_inodes is: %.2f", + prev->this->name, + conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + } + } + UNLOCK (&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_DESTROY (frame); - return 0; + return 0; } int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; - - conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - goto err; - } - - /* local->fop value is not used in this case */ - statfs_local = dht_local_init (statfs_frame, NULL, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop 
value is not used in this case */ - statfs_local = dht_local_init (statfs_frame, NULL, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame (this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value is not used in this case */ + statfs_local = dht_local_init (statfs_frame, NULL, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + loc_t tmp_loc = { .inode = NULL, + .path = "/", + }; + + statfs_local->call_cnt = 1; + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, + &tmp_loc); + + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; - - conf = this->private; - - gettimeofday (&tv, NULL); - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto err; - } - - /* In this case, 'local->fop' is not used */ - statfs_local = dht_local_init (statfs_frame, loc, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - loc_t tmp_loc = { .inode = NULL, - .path = "/", - }; - - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, 
- conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc); - } - - conf->last_stat_fetch.tv_sec = tv.tv_sec; - } - return 0; + int i = 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + struct timeval tv = {0,}; + + conf = this->private; + + gettimeofday (&tv, NULL); + if (tv.tv_sec > (conf->refresh_interval + + conf->last_stat_fetch.tv_sec)) { + + statfs_frame = copy_frame (frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init (statfs_frame, loc, NULL, + GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + loc_t tmp_loc = { .inode = NULL, + .path = "/", + }; + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, + &tmp_loc); + } + + conf->last_stat_fetch.tv_sec = tv.tv_sec; + } + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY (statfs_frame); - return -1; + return -1; } -int +gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) { - int i = 0; - int subvol_filled = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled = 1; - break; - } - } - } - } - } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f 
%%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); - } - } - - return subvol_filled; + int i = 0; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < + conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < + conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_WARNING, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_percent)); + } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + gf_log (this->name, GF_LOG_CRITICAL, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more nodes", + subvol->name, + (100 - conf->du_stats[i].avail_inodes)); + } + } + + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; } xlator_t * dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) { - int i = 0; - double max= 0; - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - - conf = this->private; - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { 
- if (conf->du_stats[i].avail_percent > max) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if (conf->du_stats[i].avail_space > max) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } - } - } - } - UNLOCK (&conf->subvolume_lock); - - if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space to create"); - } - - if (max < conf->min_free_disk) - avail_subvol = subvol; - - if (!avail_subvol) - avail_subvol = subvol; - - return avail_subvol; + int i = 0; + double max = 0; + double max_inodes = 0; + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + LOCK (&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > max_inodes)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > max_inodes)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + + } + } + } + UNLOCK (&conf->subvolume_lock); + + if (!avail_subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no subvolume has enough free space and inodes to create"); + } + + if ((max < conf->min_free_disk) && (max_inodes < conf->min_free_inodes)) + avail_subvol = subvol; + + if (!avail_subvol) + avail_subvol = subvol; + + return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index c5bb36be05c..8be573f5165 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -131,6 +131,7 @@ dht_priv_dump (xlator_t *this) gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); gf_proc_dump_write("gen", "%d", conf->gen); 
gf_proc_dump_write("min_free_disk", "%lu", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lu", conf->min_free_inodes); gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); @@ -139,6 +140,8 @@ dht_priv_dump (xlator_t *this) conf->du_stats->avail_percent); gf_proc_dump_write("du_stats.avail_space", "%lu", conf->du_stats->avail_space); + gf_proc_dump_write("du_stats.avail_inodes", "%lf", + conf->du_stats->avail_inodes); gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); } gf_proc_dump_write("last_stat_fetch", "%s", ctime(&conf->last_stat_fetch.tv_sec)); @@ -318,9 +321,10 @@ reconfigure (xlator_t *this, dict_t *options) } } - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, percent_or_size, out); - + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, options, uint32, out); @@ -376,7 +380,10 @@ init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); conf->dir_spread_cnt = conf->subvolume_cnt; @@ -517,6 +524,12 @@ struct volume_options options[] = { .description = "Percentage/Size of disk space that must be " "kept free." }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "Percentage inodes that must be " + "kept free." + }, { .key = {"unhashed-sticky-bit"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", -- cgit