diff options
Diffstat (limited to 'scheduler/alu')
-rw-r--r-- | scheduler/alu/Makefile.am | 3 | ||||
-rw-r--r-- | scheduler/alu/src/Makefile.am | 14 | ||||
-rw-r--r-- | scheduler/alu/src/alu.c | 993 | ||||
-rw-r--r-- | scheduler/alu/src/alu.h | 89 |
4 files changed, 1099 insertions, 0 deletions
diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/alu/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am new file mode 100644 index 00000000000..eb7d0db07e0 --- /dev/null +++ b/scheduler/alu/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = alu.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +alu_la_LDFLAGS = -module -avoidversion + +alu_la_SOURCES = alu.c +alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = alu.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c new file mode 100644 index 00000000000..754c5e35352 --- /dev/null +++ b/scheduler/alu/src/alu.c @@ -0,0 +1,993 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + + +/* ALU code needs a complete re-write. This is one of the most important + * part of GlusterFS and so needs more and more reviews and testing + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> +#include <stdint.h> +#include "stack.h" +#include "alu.h" + +#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT (1 * GF_UNIT_GB) +#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT (512 * GF_UNIT_MB) + +#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT 1000 +#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT 100 + +#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT 100 + +#define ALU_REFRESH_INTERVAL_DEFAULT 5 +#define ALU_REFRESH_CREATE_COUNT_DEFAULT 5 + + +static int64_t +get_stats_disk_usage (struct xlator_stats *this) +{ + return this->disk_usage; +} + +static int64_t +get_stats_write_usage (struct xlator_stats *this) +{ + return this->write_usage; +} + +static int64_t +get_stats_read_usage (struct xlator_stats *this) +{ + return this->read_usage; +} + +static int64_t +get_stats_disk_speed (struct xlator_stats *this) +{ + return this->disk_speed; +} + +static int64_t +get_stats_file_usage (struct xlator_stats *this) +{ + /* Avoid warning "defined but not used" */ + (void) &get_stats_file_usage; + + return this->nr_files; +} + +static int64_t +get_stats_free_disk (struct xlator_stats *this) +{ + if (this->total_disk_size > 0) + return (this->free_disk * 100) / this->total_disk_size; + return 0; +} + +static int64_t +get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->write_usage - min->write_usage); +} + +static int64_t +get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->read_usage - min->read_usage); +} + +static int64_t +get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_usage - min->disk_usage); +} + +static int64_t +get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_speed - min->disk_speed); +} + +static int64_t +get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->nr_files - min->nr_files); +} + + +int +alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched) +{ + data_t *order = dict_get (xl->options, "scheduler.alu.order"); + if (!order) { + gf_log (xl->name, GF_LOG_ERROR, + "option 'scheduler.alu.order' not specified"); + return -1; + } + struct alu_threshold *_threshold_fn; + struct alu_threshold *tmp_threshold; + data_t *entry_fn = NULL; + data_t *exit_fn = NULL; + char *tmp_str = NULL; + char *order_str = strtok_r (order->data, ":", &tmp_str); + /* Get the scheduling priority order, specified by the user. */ + while (order_str) { + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: order string: %s", + order_str); + if (strcmp (order_str, "disk-usage") == 0) { + /* Disk usage */ + _threshold_fn = + CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_usage; + _threshold_fn->sched_value = get_stats_disk_usage; + entry_fn = + dict_get (xl->options, + "scheduler.alu.disk-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "disk-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_disk_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "disk-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_disk_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", + GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.disk_usage, + alu_sched->exit_limit.disk_usage); + + } else if (strcmp (order_str, "write-usage") == 0) { + /* Handle "write-usage" */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_write_usage; + _threshold_fn->sched_value = get_stats_write_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.write-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of option scheduler.alu." + "write-usage.entry-threshold", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_write_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.write-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\"" + " of \"option scheduler.alu." + "write-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_write_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log (xl->name, GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.write_usage, + alu_sched->exit_limit.write_usage); + + } else if (strcmp (order_str, "read-usage") == 0) { + /* Read usage */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_read_usage; + _threshold_fn->sched_value = get_stats_read_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.read-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.read_usage) != 0) { + gf_log (xl->name, + GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "read-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_read_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.read-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.read_usage) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "read-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_read_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.read_usage, + alu_sched->exit_limit.read_usage); + + } else if (strcmp (order_str, "open-files-usage") == 0) { + /* Open files counter */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_file_usage; + _threshold_fn->sched_value = get_stats_file_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2uint64 (entry_fn->data, + &alu_sched->entry_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "open-files-usage.entry-" + "threshold\"", entry_fn->data); + return -1; + } + } + else + { + alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_file_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2uint64 (exit_fn->data, + &alu_sched->exit_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "open-files-usage.exit-" + "threshold\"", exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_file_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu.c->alu_init: = %"PRIu64",%"PRIu64"", + alu_sched->entry_limit.nr_files, + alu_sched->exit_limit.nr_files); + + } else if (strcmp (order_str, "disk-speed-usage") == 0) { + /* Disk speed */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_speed; + _threshold_fn->sched_value = get_stats_disk_speed; + entry_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.entry-threshold"); + if (entry_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "entry-threshold is given, " + "value is constant"); + } + _threshold_fn->entry_value = NULL; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.exit-threshold"); + if (exit_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "exit-threshold is given, " + "value is constant"); + } + _threshold_fn->exit_value = NULL; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + + } else { + gf_log ("alu", GF_LOG_DEBUG, + "%s, unknown option provided to scheduler", + order_str); + } + order_str = strtok_r (NULL, ":", &tmp_str); + } + + return 0; +} + +static int32_t +alu_init (xlator_t *xl) +{ + struct alu_sched *alu_sched = NULL; + struct alu_limits *_limit_fn = NULL; + struct alu_limits *tmp_limits = NULL; + uint32_t min_free_disk = 0; + data_t *limits = NULL; + + alu_sched = CALLOC (1, sizeof (struct alu_sched)); + ERR_ABORT (alu_sched); + + { + alu_parse_options (xl, alu_sched); + } + + /* Get the limits */ + + limits = dict_get (xl->options, + "scheduler.limits.min-free-disk"); + if (limits) { + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->min_value = get_stats_free_disk; + _limit_fn->cur_value = get_stats_free_disk; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + + if (gf_string2percent (limits->data, + &min_free_disk) != 0) { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.limits." + "min-free-disk\"", limits->data); + return -1; + } + alu_sched->spec_limit.free_disk = min_free_disk; + + if (alu_sched->spec_limit.free_disk >= 100) { + gf_log ("alu", GF_LOG_ERROR, + "check the \"option scheduler." + "limits.min-free-disk\", it should " + "be percentage value"); + return -1; + } + alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */ + gf_log ("alu", GF_LOG_DEBUG, + "alu.limit.min-disk-free = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + limits = dict_get (xl->options, + "scheduler.limits.max-open-files"); + if (limits) { + // Update alu_sched->priority properly + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->max_value = get_stats_file_usage; + _limit_fn->cur_value = get_stats_file_usage; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + if (gf_string2uint64_base10 (limits->data, + &alu_sched->spec_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format '%s' of option " + "scheduler.limits.max-open-files", + limits->data); + return -1; + } + + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: limit.max-open-files = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + + /* Stats refresh options */ + limits = dict_get (xl->options, + "scheduler.refresh-interval"); + if (limits) { + if (gf_string2time (limits->data, + &alu_sched->refresh_interval) != 0) { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "option scheduler.refresh-interval", + limits->data); + return -1; + } + } else { + alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT; + } + gettimeofday (&(alu_sched->last_stat_fetch), NULL); + + + limits = dict_get (xl->options, + "scheduler.alu.stat-refresh.num-file-create"); + if (limits) { + if (gf_string2uint32 (limits->data, + &alu_sched->refresh_create_count) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" of \"option " + "alu.stat-refresh.num-file-create\"", + limits->data); + return -1; + } + } else { + alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT; + } + + { + /* Build an array of child_nodes */ + struct alu_sched_struct *sched_array = NULL; + xlator_list_t *trav_xl = xl->children; + data_t *data = NULL; + int32_t index = 0; + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + alu_sched->child_count = index; + sched_array = CALLOC (index, sizeof (struct alu_sched_struct)); + ERR_ABORT (sched_array); + trav_xl = xl->children; + index = 0; + while (trav_xl) { + sched_array[index].xl = trav_xl->xlator; + sched_array[index].eligible = 1; + index++; + trav_xl = trav_xl->next; + } + alu_sched->array = sched_array; + + data = dict_get (xl->options, + "scheduler.read-only-subvolumes"); + if (data) { + char *child = NULL; + char *tmp = NULL; + char *childs_data = strdup (data->data); + + child = strtok_r (childs_data, ",", &tmp); + while (child) { + for (index = 1; index < alu_sched->child_count; index++) { + if (strcmp (alu_sched->array[index -1].xl->name, child) == 0) { + memcpy (&(alu_sched->array[index -1]), + &(alu_sched->array[alu_sched->child_count -1]), + sizeof (struct alu_sched_struct)); + alu_sched->child_count--; + break; + } + } + child = strtok_r (NULL, ",", &tmp); + } + } + } + + *((long *)xl->private) = (long)alu_sched; + + /* Initialize all the alu_sched structure's elements */ + { + alu_sched->sched_nodes_pending = 0; + + alu_sched->min_limit.free_disk = 0x00FFFFFF; + alu_sched->min_limit.disk_usage = 0xFFFFFFFF; + alu_sched->min_limit.total_disk_size = 0xFFFFFFFF; + alu_sched->min_limit.disk_speed = 0xFFFFFFFF; + alu_sched->min_limit.write_usage = 0xFFFFFFFF; + alu_sched->min_limit.read_usage = 0xFFFFFFFF; + alu_sched->min_limit.nr_files = 0xFFFFFFFF; + alu_sched->min_limit.nr_clients = 0xFFFFFFFF; + } + + pthread_mutex_init (&alu_sched->alu_mutex, NULL); + return 0; +} + +static void +alu_fini (xlator_t *xl) +{ + if (!xl) + return; + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + struct alu_limits *limit = alu_sched->limits_fn; + struct alu_threshold *threshold = alu_sched->threshold_fn; + void *tmp = NULL; + pthread_mutex_destroy (&alu_sched->alu_mutex); + free (alu_sched->array); + while (limit) { + tmp = limit; + limit = limit->next; + free (tmp); + } + while (threshold) { + tmp = threshold; + threshold = threshold->next; + free (tmp); + } + free (alu_sched); +} + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + struct alu_limits *limits_fn = alu_sched->limits_fn; + int32_t idx = 0; + + pthread_mutex_lock (&alu_sched->alu_mutex); + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (alu_sched->array[idx].xl == (xlator_t *)cookie) + break; + } + pthread_mutex_unlock (&alu_sched->alu_mutex); + + if (op_ret == -1) { + alu_sched->array[idx].eligible = 0; + } else { + memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats)); + + /* Get stats from all the child node */ + /* Here check the limits specified by the user to + consider the nodes to be used by scheduler */ + alu_sched->array[idx].eligible = 1; + limits_fn = alu_sched->limits_fn; + while (limits_fn){ + if (limits_fn->max_value && + (limits_fn->cur_value (trav_stats) > + limits_fn->max_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + if (limits_fn->min_value && + (limits_fn->cur_value (trav_stats) < + limits_fn->min_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + limits_fn = limits_fn->next; + } + + /* Select minimum and maximum disk_usage */ + if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) { + alu_sched->max_limit.disk_usage = trav_stats->disk_usage; + } + if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) { + alu_sched->min_limit.disk_usage = trav_stats->disk_usage; + } + + /* Select minimum and maximum disk_speed */ + if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) { + alu_sched->max_limit.disk_speed = trav_stats->disk_speed; + } + if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) { + alu_sched->min_limit.disk_speed = trav_stats->disk_speed; + } + + /* Select minimum and maximum number of open files */ + if (trav_stats->nr_files > alu_sched->max_limit.nr_files) { + alu_sched->max_limit.nr_files = trav_stats->nr_files; + } + if (trav_stats->nr_files < alu_sched->min_limit.nr_files) { + alu_sched->min_limit.nr_files = trav_stats->nr_files; + } + + /* Select minimum and maximum write-usage */ + if (trav_stats->write_usage > alu_sched->max_limit.write_usage) { + alu_sched->max_limit.write_usage = trav_stats->write_usage; + } + if (trav_stats->write_usage < alu_sched->min_limit.write_usage) { + alu_sched->min_limit.write_usage = trav_stats->write_usage; + } + + /* Select minimum and maximum read-usage */ + if (trav_stats->read_usage > alu_sched->max_limit.read_usage) { + alu_sched->max_limit.read_usage = trav_stats->read_usage; + } + if (trav_stats->read_usage < alu_sched->min_limit.read_usage) { + alu_sched->min_limit.read_usage = trav_stats->read_usage; + } + + /* Select minimum and maximum free-disk */ + if (trav_stats->free_disk > alu_sched->max_limit.free_disk) { + alu_sched->max_limit.free_disk = trav_stats->free_disk; + } + if (trav_stats->free_disk < alu_sched->min_limit.free_disk) { + alu_sched->min_limit.free_disk = trav_stats->free_disk; + } + } + + STACK_DESTROY (frame->root); + + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t idx = 0; + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + for (idx = 0 ; idx < alu_sched->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + alu_sched->array[idx].xl, //cookie + alu_sched->array[idx].xl, + (alu_sched->array[idx].xl)->mops->stats, + 0); //flag + } + return; +} + +static void +alu_update (xlator_t *xl) +{ + struct timeval tv; + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + + gettimeofday (&tv, NULL); + if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) { + /* Update the stats from all the server */ + update_stat_array (xl); + alu_sched->last_stat_fetch.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +alu_scheduler (xlator_t *xl, const void *path) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t sched_index = 0; + int32_t sched_index_orig = 0; + int32_t idx = 0; + + alu_update (xl); + + /* Now check each threshold one by one if some nodes are classified */ + { + struct alu_threshold *trav_threshold = alu_sched->threshold_fn; + struct alu_threshold *tmp_threshold = alu_sched->sched_method; + struct alu_sched_node *tmp_sched_node; + + /* This pointer 'trav_threshold' contains function pointers according to spec file + give by user, */ + while (trav_threshold) { + /* This check is needed for seeing if already there are nodes in this criteria + to be scheduled */ + if (!alu_sched->sched_nodes_pending) { + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (!alu_sched->array[idx].eligible) { + continue; + } + if (trav_threshold->entry_value) { + if (trav_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[idx].stats)) < + trav_threshold->entry_value (&(alu_sched->entry_limit))) { + continue; + } + } + tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node)); + ERR_ABORT (tmp_sched_node); + tmp_sched_node->index = idx; + if (!alu_sched->sched_node) { + alu_sched->sched_node = tmp_sched_node; + } else { + pthread_mutex_lock (&alu_sched->alu_mutex); + tmp_sched_node->next = alu_sched->sched_node; + alu_sched->sched_node = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + alu_sched->sched_nodes_pending++; + } + } /* end of if (sched_nodes_pending) */ + + /* This loop is required to check the eligible nodes */ + struct alu_sched_node *trav_sched_node; + while (alu_sched->sched_nodes_pending) { + trav_sched_node = alu_sched->sched_node; + sched_index = trav_sched_node->index; + if (alu_sched->array[sched_index].eligible) + break; + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + alu_sched->sched_nodes_pending--; + } + if (alu_sched->sched_nodes_pending) { + /* There are some node in this criteria to be scheduled, no need + * to sort and check other methods + */ + if (tmp_threshold && tmp_threshold->exit_value) { + /* verify the exit value && whether node is eligible or not */ + if (tmp_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[sched_index].stats)) > + tmp_threshold->exit_value (&(alu_sched->exit_limit))) { + /* Free the allocated info for the node :) */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } else { + /* if there is no exit value, then exit after scheduling once */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + + alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */ + + /* */ + if (trav_sched_node) { + tmp_sched_node = trav_sched_node; + while (trav_sched_node->next) { + trav_sched_node = trav_sched_node->next; + } + if (tmp_sched_node->next) { + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = tmp_sched_node->next; + tmp_sched_node->next = NULL; + trav_sched_node->next = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } + /* return the scheduled node */ + return alu_sched->array[sched_index].xl; + } /* end of if (pending_nodes) */ + + tmp_threshold = trav_threshold; + trav_threshold = trav_threshold->next; + } + } + + /* This is used only when there is everything seems ok, or no eligible nodes */ + sched_index_orig = alu_sched->sched_index; + alu_sched->sched_method = NULL; + while (1) { + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + sched_index = alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + if (alu_sched->array[sched_index].eligible) + break; + if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) { + gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule"); + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + break; + } + } + return alu_sched->array[sched_index].xl; +} + +/** + * notify + */ +void +alu_notify (xlator_t *xl, int32_t event, void *data) +{ + struct alu_sched *alu_sched = NULL; + int32_t idx = 0; + + alu_sched = (struct alu_sched *)*((long *)xl->private); + if (!alu_sched) + return; + + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (alu_sched->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //alu_sched->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + alu_sched->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = alu_init, + .fini = alu_fini, + .update = alu_update, + .schedule = alu_scheduler, + .notify = alu_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.alu.order", "alu.order" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.alu.disk-usage.entry-threshold", + "alu.disk-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.disk-usage.exit-threshold", + "alu.disk-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.write-usage.entry-threshold", + "alu.write-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.write-usage.exit-threshold", + "alu.write-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.read-usage.entry-threshold", + "alu.read-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.read-usage.exit-threshold", + "alu.read-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.open-files-usage.entry-threshold", + "alu.open-files-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_INT + }, + { .key = { "scheduler.alu.open-files-usage.exit-threshold", + "alu.open-files-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_INT + }, + { .key = { "scheduler.read-only-subvolumes", + "alu.read-only-subvolumes" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.refresh-interval", + "alu.refresh-interval", + "alu.stat-refresh.interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "alu.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.alu.stat-refresh.num-file-create" + "alu.stat-refresh.num-file-create"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {NULL}, } +}; diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h new file mode 100644 index 00000000000..a958bb4d298 --- /dev/null +++ b/scheduler/alu/src/alu.h @@ -0,0 +1,89 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _ALU_H +#define _ALU_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" + +struct alu_sched; + +struct alu_sched_struct { + xlator_t *xl; + struct xlator_stats stats; + unsigned char eligible; +}; + +// Write better name for these functions +struct alu_limits { + struct alu_limits *next; + int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */ + int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */ + int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */ +}; + +struct alu_threshold { + struct alu_threshold *next; + int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */ + int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */ + int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */ + int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */ +}; + +struct alu_sched_node { + struct alu_sched_node *next; + int32_t index; +}; + +struct alu_sched { + struct alu_limits *limits_fn; + struct alu_threshold *threshold_fn; + struct alu_sched_struct *array; + struct alu_sched_node *sched_node; + struct alu_threshold *sched_method; + struct xlator_stats max_limit; + struct xlator_stats min_limit; + struct xlator_stats entry_limit; + struct xlator_stats exit_limit; + struct xlator_stats spec_limit; /* User given limit */ + + pthread_mutex_t alu_mutex; + struct timeval last_stat_fetch; + uint32_t refresh_interval; /* in seconds */ + uint32_t refresh_create_count; /* num-file-create */ + + int32_t sched_nodes_pending; + + int32_t sched_index; /* used for round robin scheduling in case of many nodes getting the criteria match. */ + int32_t child_count; + +}; + +struct _alu_local_t { + int32_t call_count; +}; + +typedef struct _alu_local_t alu_local_t; + +#endif /* _ALU_H */ |