From 77adf4cd648dce41f89469dd185deec6b6b53a0b Mon Sep 17 00:00:00 2001 From: Vikas Gorur Date: Wed, 18 Feb 2009 17:36:07 +0530 Subject: Added all files --- scheduler/Makefile.am | 3 + scheduler/alu/Makefile.am | 3 + scheduler/alu/src/Makefile.am | 14 + scheduler/alu/src/alu.c | 993 +++++++++++++++++++++++++++++++++++++++ scheduler/alu/src/alu.h | 89 ++++ scheduler/nufa/Makefile.am | 3 + scheduler/nufa/src/Makefile.am | 12 + scheduler/nufa/src/nufa.c | 403 ++++++++++++++++ scheduler/random/Makefile.am | 3 + scheduler/random/src/Makefile.am | 14 + scheduler/random/src/random.c | 283 +++++++++++ scheduler/random/src/random.h | 46 ++ scheduler/rr/Makefile.am | 3 + scheduler/rr/src/Makefile.am | 13 + scheduler/rr/src/rr-options.c | 256 ++++++++++ scheduler/rr/src/rr-options.h | 34 ++ scheduler/rr/src/rr.c | 565 ++++++++++++++++++++++ scheduler/rr/src/rr.h | 70 +++ scheduler/switch/Makefile.am | 3 + scheduler/switch/src/Makefile.am | 12 + scheduler/switch/src/switch.c | 398 ++++++++++++++++ 21 files changed, 3220 insertions(+) create mode 100644 scheduler/Makefile.am create mode 100644 scheduler/alu/Makefile.am create mode 100644 scheduler/alu/src/Makefile.am create mode 100644 scheduler/alu/src/alu.c create mode 100644 scheduler/alu/src/alu.h create mode 100644 scheduler/nufa/Makefile.am create mode 100644 scheduler/nufa/src/Makefile.am create mode 100644 scheduler/nufa/src/nufa.c create mode 100644 scheduler/random/Makefile.am create mode 100644 scheduler/random/src/Makefile.am create mode 100644 scheduler/random/src/random.c create mode 100644 scheduler/random/src/random.h create mode 100644 scheduler/rr/Makefile.am create mode 100644 scheduler/rr/src/Makefile.am create mode 100644 scheduler/rr/src/rr-options.c create mode 100644 scheduler/rr/src/rr-options.h create mode 100644 scheduler/rr/src/rr.c create mode 100644 scheduler/rr/src/rr.h create mode 100644 scheduler/switch/Makefile.am create mode 100644 scheduler/switch/src/Makefile.am create mode 100644 scheduler/switch/src/switch.c (limited to 'scheduler') diff --git a/scheduler/Makefile.am b/scheduler/Makefile.am new file mode 100644 index 000000000..618fa7dd8 --- /dev/null +++ b/scheduler/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = alu random nufa rr switch + +CLEANFILES = diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/alu/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am new file mode 100644 index 000000000..eb7d0db07 --- /dev/null +++ b/scheduler/alu/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = alu.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +alu_la_LDFLAGS = -module -avoidversion + +alu_la_SOURCES = alu.c +alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = alu.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c new file mode 100644 index 000000000..754c5e353 --- /dev/null +++ b/scheduler/alu/src/alu.c @@ -0,0 +1,993 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + + + +/* ALU code needs a complete re-write. This is one of the most important + * part of GlusterFS and so needs more and more reviews and testing + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include "stack.h" +#include "alu.h" + +#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT (1 * GF_UNIT_GB) +#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT (512 * GF_UNIT_MB) + +#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT 1000 +#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT 100 + +#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT 100 + +#define ALU_REFRESH_INTERVAL_DEFAULT 5 +#define ALU_REFRESH_CREATE_COUNT_DEFAULT 5 + + +static int64_t +get_stats_disk_usage (struct xlator_stats *this) +{ + return this->disk_usage; +} + +static int64_t +get_stats_write_usage (struct xlator_stats *this) +{ + return this->write_usage; +} + +static int64_t +get_stats_read_usage (struct xlator_stats *this) +{ + return this->read_usage; +} + +static int64_t +get_stats_disk_speed (struct xlator_stats *this) +{ + return this->disk_speed; +} + +static int64_t +get_stats_file_usage (struct xlator_stats *this) +{ + /* Avoid warning "defined but not used" */ + (void) &get_stats_file_usage; + + return this->nr_files; +} + +static int64_t +get_stats_free_disk (struct xlator_stats *this) +{ + if (this->total_disk_size > 0) + return (this->free_disk * 100) / this->total_disk_size; + return 0; +} + +static int64_t +get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->write_usage - min->write_usage); +} + +static int64_t +get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->read_usage - min->read_usage); +} + +static int64_t +get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_usage - min->disk_usage); +} + +static int64_t +get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_speed - min->disk_speed); +} + +static int64_t +get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->nr_files - min->nr_files); +} + + +int +alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched) +{ + data_t *order = dict_get (xl->options, "scheduler.alu.order"); + if (!order) { + gf_log (xl->name, GF_LOG_ERROR, + "option 'scheduler.alu.order' not specified"); + return -1; + } + struct alu_threshold *_threshold_fn; + struct alu_threshold *tmp_threshold; + data_t *entry_fn = NULL; + data_t *exit_fn = NULL; + char *tmp_str = NULL; + char *order_str = strtok_r (order->data, ":", &tmp_str); + /* Get the scheduling priority order, specified by the user. */ + while (order_str) { + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: order string: %s", + order_str); + if (strcmp (order_str, "disk-usage") == 0) { + /* Disk usage */ + _threshold_fn = + CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_usage; + _threshold_fn->sched_value = get_stats_disk_usage; + entry_fn = + dict_get (xl->options, + "scheduler.alu.disk-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "disk-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_disk_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "disk-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_disk_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", + GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.disk_usage, + alu_sched->exit_limit.disk_usage); + + } else if (strcmp (order_str, "write-usage") == 0) { + /* Handle "write-usage" */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_write_usage; + _threshold_fn->sched_value = get_stats_write_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.write-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of option scheduler.alu." + "write-usage.entry-threshold", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_write_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.write-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\"" + " of \"option scheduler.alu." + "write-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_write_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log (xl->name, GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.write_usage, + alu_sched->exit_limit.write_usage); + + } else if (strcmp (order_str, "read-usage") == 0) { + /* Read usage */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_read_usage; + _threshold_fn->sched_value = get_stats_read_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.read-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.read_usage) != 0) { + gf_log (xl->name, + GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "read-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_read_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.read-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.read_usage) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "read-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_read_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.read_usage, + alu_sched->exit_limit.read_usage); + + } else if (strcmp (order_str, "open-files-usage") == 0) { + /* Open files counter */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_file_usage; + _threshold_fn->sched_value = get_stats_file_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2uint64 (entry_fn->data, + &alu_sched->entry_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "open-files-usage.entry-" + "threshold\"", entry_fn->data); + return -1; + } + } + else + { + alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_file_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2uint64 (exit_fn->data, + &alu_sched->exit_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "open-files-usage.exit-" + "threshold\"", exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_file_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu.c->alu_init: = %"PRIu64",%"PRIu64"", + alu_sched->entry_limit.nr_files, + alu_sched->exit_limit.nr_files); + + } else if (strcmp (order_str, "disk-speed-usage") == 0) { + /* Disk speed */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_speed; + _threshold_fn->sched_value = get_stats_disk_speed; + entry_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.entry-threshold"); + if (entry_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "entry-threshold is given, " + "value is constant"); + } + _threshold_fn->entry_value = NULL; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.exit-threshold"); + if (exit_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "exit-threshold is given, " + "value is constant"); + } + _threshold_fn->exit_value = NULL; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + + } else { + gf_log ("alu", GF_LOG_DEBUG, + "%s, unknown option provided to scheduler", + order_str); + } + order_str = strtok_r (NULL, ":", &tmp_str); + } + + return 0; +} + +static int32_t +alu_init (xlator_t *xl) +{ + struct alu_sched *alu_sched = NULL; + struct alu_limits *_limit_fn = NULL; + struct alu_limits *tmp_limits = NULL; + uint32_t min_free_disk = 0; + data_t *limits = NULL; + + alu_sched = CALLOC (1, sizeof (struct alu_sched)); + ERR_ABORT (alu_sched); + + { + alu_parse_options (xl, alu_sched); + } + + /* Get the limits */ + + limits = dict_get (xl->options, + "scheduler.limits.min-free-disk"); + if (limits) { + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->min_value = get_stats_free_disk; + _limit_fn->cur_value = get_stats_free_disk; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + + if (gf_string2percent (limits->data, + &min_free_disk) != 0) { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.limits." + "min-free-disk\"", limits->data); + return -1; + } + alu_sched->spec_limit.free_disk = min_free_disk; + + if (alu_sched->spec_limit.free_disk >= 100) { + gf_log ("alu", GF_LOG_ERROR, + "check the \"option scheduler." + "limits.min-free-disk\", it should " + "be percentage value"); + return -1; + } + alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */ + gf_log ("alu", GF_LOG_DEBUG, + "alu.limit.min-disk-free = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + limits = dict_get (xl->options, + "scheduler.limits.max-open-files"); + if (limits) { + // Update alu_sched->priority properly + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->max_value = get_stats_file_usage; + _limit_fn->cur_value = get_stats_file_usage; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + if (gf_string2uint64_base10 (limits->data, + &alu_sched->spec_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format '%s' of option " + "scheduler.limits.max-open-files", + limits->data); + return -1; + } + + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: limit.max-open-files = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + + /* Stats refresh options */ + limits = dict_get (xl->options, + "scheduler.refresh-interval"); + if (limits) { + if (gf_string2time (limits->data, + &alu_sched->refresh_interval) != 0) { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "option scheduler.refresh-interval", + limits->data); + return -1; + } + } else { + alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT; + } + gettimeofday (&(alu_sched->last_stat_fetch), NULL); + + + limits = dict_get (xl->options, + "scheduler.alu.stat-refresh.num-file-create"); + if (limits) { + if (gf_string2uint32 (limits->data, + &alu_sched->refresh_create_count) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" of \"option " + "alu.stat-refresh.num-file-create\"", + limits->data); + return -1; + } + } else { + alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT; + } + + { + /* Build an array of child_nodes */ + struct alu_sched_struct *sched_array = NULL; + xlator_list_t *trav_xl = xl->children; + data_t *data = NULL; + int32_t index = 0; + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + alu_sched->child_count = index; + sched_array = CALLOC (index, sizeof (struct alu_sched_struct)); + ERR_ABORT (sched_array); + trav_xl = xl->children; + index = 0; + while (trav_xl) { + sched_array[index].xl = trav_xl->xlator; + sched_array[index].eligible = 1; + index++; + trav_xl = trav_xl->next; + } + alu_sched->array = sched_array; + + data = dict_get (xl->options, + "scheduler.read-only-subvolumes"); + if (data) { + char *child = NULL; + char *tmp = NULL; + char *childs_data = strdup (data->data); + + child = strtok_r (childs_data, ",", &tmp); + while (child) { + for (index = 1; index < alu_sched->child_count; index++) { + if (strcmp (alu_sched->array[index -1].xl->name, child) == 0) { + memcpy (&(alu_sched->array[index -1]), + &(alu_sched->array[alu_sched->child_count -1]), + sizeof (struct alu_sched_struct)); + alu_sched->child_count--; + break; + } + } + child = strtok_r (NULL, ",", &tmp); + } + } + } + + *((long *)xl->private) = (long)alu_sched; + + /* Initialize all the alu_sched structure's elements */ + { + alu_sched->sched_nodes_pending = 0; + + alu_sched->min_limit.free_disk = 0x00FFFFFF; + alu_sched->min_limit.disk_usage = 0xFFFFFFFF; + alu_sched->min_limit.total_disk_size = 0xFFFFFFFF; + alu_sched->min_limit.disk_speed = 0xFFFFFFFF; + alu_sched->min_limit.write_usage = 0xFFFFFFFF; + alu_sched->min_limit.read_usage = 0xFFFFFFFF; + alu_sched->min_limit.nr_files = 0xFFFFFFFF; + alu_sched->min_limit.nr_clients = 0xFFFFFFFF; + } + + pthread_mutex_init (&alu_sched->alu_mutex, NULL); + return 0; +} + +static void +alu_fini (xlator_t *xl) +{ + if (!xl) + return; + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + struct alu_limits *limit = alu_sched->limits_fn; + struct alu_threshold *threshold = alu_sched->threshold_fn; + void *tmp = NULL; + pthread_mutex_destroy (&alu_sched->alu_mutex); + free (alu_sched->array); + while (limit) { + tmp = limit; + limit = limit->next; + free (tmp); + } + while (threshold) { + tmp = threshold; + threshold = threshold->next; + free (tmp); + } + free (alu_sched); +} + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + struct alu_limits *limits_fn = alu_sched->limits_fn; + int32_t idx = 0; + + pthread_mutex_lock (&alu_sched->alu_mutex); + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (alu_sched->array[idx].xl == (xlator_t *)cookie) + break; + } + pthread_mutex_unlock (&alu_sched->alu_mutex); + + if (op_ret == -1) { + alu_sched->array[idx].eligible = 0; + } else { + memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats)); + + /* Get stats from all the child node */ + /* Here check the limits specified by the user to + consider the nodes to be used by scheduler */ + alu_sched->array[idx].eligible = 1; + limits_fn = alu_sched->limits_fn; + while (limits_fn){ + if (limits_fn->max_value && + (limits_fn->cur_value (trav_stats) > + limits_fn->max_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + if (limits_fn->min_value && + (limits_fn->cur_value (trav_stats) < + limits_fn->min_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + limits_fn = limits_fn->next; + } + + /* Select minimum and maximum disk_usage */ + if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) { + alu_sched->max_limit.disk_usage = trav_stats->disk_usage; + } + if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) { + alu_sched->min_limit.disk_usage = trav_stats->disk_usage; + } + + /* Select minimum and maximum disk_speed */ + if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) { + alu_sched->max_limit.disk_speed = trav_stats->disk_speed; + } + if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) { + alu_sched->min_limit.disk_speed = trav_stats->disk_speed; + } + + /* Select minimum and maximum number of open files */ + if (trav_stats->nr_files > alu_sched->max_limit.nr_files) { + alu_sched->max_limit.nr_files = trav_stats->nr_files; + } + if (trav_stats->nr_files < alu_sched->min_limit.nr_files) { + alu_sched->min_limit.nr_files = trav_stats->nr_files; + } + + /* Select minimum and maximum write-usage */ + if (trav_stats->write_usage > alu_sched->max_limit.write_usage) { + alu_sched->max_limit.write_usage = trav_stats->write_usage; + } + if (trav_stats->write_usage < alu_sched->min_limit.write_usage) { + alu_sched->min_limit.write_usage = trav_stats->write_usage; + } + + /* Select minimum and maximum read-usage */ + if (trav_stats->read_usage > alu_sched->max_limit.read_usage) { + alu_sched->max_limit.read_usage = trav_stats->read_usage; + } + if (trav_stats->read_usage < alu_sched->min_limit.read_usage) { + alu_sched->min_limit.read_usage = trav_stats->read_usage; + } + + /* Select minimum and maximum free-disk */ + if (trav_stats->free_disk > alu_sched->max_limit.free_disk) { + alu_sched->max_limit.free_disk = trav_stats->free_disk; + } + if (trav_stats->free_disk < alu_sched->min_limit.free_disk) { + alu_sched->min_limit.free_disk = trav_stats->free_disk; + } + } + + STACK_DESTROY (frame->root); + + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t idx = 0; + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + for (idx = 0 ; idx < alu_sched->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + alu_sched->array[idx].xl, //cookie + alu_sched->array[idx].xl, + (alu_sched->array[idx].xl)->mops->stats, + 0); //flag + } + return; +} + +static void +alu_update (xlator_t *xl) +{ + struct timeval tv; + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + + gettimeofday (&tv, NULL); + if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) { + /* Update the stats from all the server */ + update_stat_array (xl); + alu_sched->last_stat_fetch.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +alu_scheduler (xlator_t *xl, const void *path) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t sched_index = 0; + int32_t sched_index_orig = 0; + int32_t idx = 0; + + alu_update (xl); + + /* Now check each threshold one by one if some nodes are classified */ + { + struct alu_threshold *trav_threshold = alu_sched->threshold_fn; + struct alu_threshold *tmp_threshold = alu_sched->sched_method; + struct alu_sched_node *tmp_sched_node; + + /* This pointer 'trav_threshold' contains function pointers according to spec file + give by user, */ + while (trav_threshold) { + /* This check is needed for seeing if already there are nodes in this criteria + to be scheduled */ + if (!alu_sched->sched_nodes_pending) { + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (!alu_sched->array[idx].eligible) { + continue; + } + if (trav_threshold->entry_value) { + if (trav_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[idx].stats)) < + trav_threshold->entry_value (&(alu_sched->entry_limit))) { + continue; + } + } + tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node)); + ERR_ABORT (tmp_sched_node); + tmp_sched_node->index = idx; + if (!alu_sched->sched_node) { + alu_sched->sched_node = tmp_sched_node; + } else { + pthread_mutex_lock (&alu_sched->alu_mutex); + tmp_sched_node->next = alu_sched->sched_node; + alu_sched->sched_node = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + alu_sched->sched_nodes_pending++; + } + } /* end of if (sched_nodes_pending) */ + + /* This loop is required to check the eligible nodes */ + struct alu_sched_node *trav_sched_node; + while (alu_sched->sched_nodes_pending) { + trav_sched_node = alu_sched->sched_node; + sched_index = trav_sched_node->index; + if (alu_sched->array[sched_index].eligible) + break; + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + alu_sched->sched_nodes_pending--; + } + if (alu_sched->sched_nodes_pending) { + /* There are some node in this criteria to be scheduled, no need + * to sort and check other methods + */ + if (tmp_threshold && tmp_threshold->exit_value) { + /* verify the exit value && whether node is eligible or not */ + if (tmp_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[sched_index].stats)) > + tmp_threshold->exit_value (&(alu_sched->exit_limit))) { + /* Free the allocated info for the node :) */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } else { + /* if there is no exit value, then exit after scheduling once */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + + alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */ + + /* */ + if (trav_sched_node) { + tmp_sched_node = trav_sched_node; + while (trav_sched_node->next) { + trav_sched_node = trav_sched_node->next; + } + if (tmp_sched_node->next) { + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = tmp_sched_node->next; + tmp_sched_node->next = NULL; + trav_sched_node->next = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } + /* return the scheduled node */ + return alu_sched->array[sched_index].xl; + } /* end of if (pending_nodes) */ + + tmp_threshold = trav_threshold; + trav_threshold = trav_threshold->next; + } + } + + /* This is used only when there is everything seems ok, or no eligible nodes */ + sched_index_orig = alu_sched->sched_index; + alu_sched->sched_method = NULL; + while (1) { + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + sched_index = alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + if (alu_sched->array[sched_index].eligible) + break; + if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) { + gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule"); + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + break; + } + } + return alu_sched->array[sched_index].xl; +} + +/** + * notify + */ +void +alu_notify (xlator_t *xl, int32_t event, void *data) +{ + struct alu_sched *alu_sched = NULL; + int32_t idx = 0; + + alu_sched = (struct alu_sched *)*((long *)xl->private); + if (!alu_sched) + return; + + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (alu_sched->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //alu_sched->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + alu_sched->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = alu_init, + .fini = alu_fini, + .update = alu_update, + .schedule = alu_scheduler, + .notify = alu_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.alu.order", "alu.order" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.alu.disk-usage.entry-threshold", + "alu.disk-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.disk-usage.exit-threshold", + "alu.disk-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.write-usage.entry-threshold", + "alu.write-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.write-usage.exit-threshold", + "alu.write-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.read-usage.entry-threshold", + "alu.read-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.read-usage.exit-threshold", + "alu.read-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_SIZET + }, + { .key = { "scheduler.alu.open-files-usage.entry-threshold", + "alu.open-files-usage.entry-threshold" }, + .type = GF_OPTION_TYPE_INT + }, + { .key = { "scheduler.alu.open-files-usage.exit-threshold", + "alu.open-files-usage.exit-threshold" }, + .type = GF_OPTION_TYPE_INT + }, + { .key = { "scheduler.read-only-subvolumes", + "alu.read-only-subvolumes" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.refresh-interval", + "alu.refresh-interval", + "alu.stat-refresh.interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "alu.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.alu.stat-refresh.num-file-create" + "alu.stat-refresh.num-file-create"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {NULL}, } +}; diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h new file mode 100644 index 000000000..a958bb4d2 --- /dev/null +++ b/scheduler/alu/src/alu.h @@ -0,0 +1,89 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _ALU_H +#define _ALU_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" + +struct alu_sched; + +struct alu_sched_struct { + xlator_t *xl; + struct xlator_stats stats; + unsigned char eligible; +}; + +// Write better name for these functions +struct alu_limits { + struct alu_limits *next; + int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */ + int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */ + int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */ +}; + +struct alu_threshold { + struct alu_threshold *next; + int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */ + int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */ + int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */ + int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */ +}; + +struct alu_sched_node { + struct alu_sched_node *next; + int32_t index; +}; + +struct alu_sched { + struct alu_limits *limits_fn; + struct alu_threshold *threshold_fn; + struct alu_sched_struct *array; + struct alu_sched_node *sched_node; + struct alu_threshold *sched_method; + struct xlator_stats max_limit; + struct xlator_stats min_limit; + struct xlator_stats entry_limit; + struct xlator_stats exit_limit; + struct xlator_stats spec_limit; /* User given limit */ + + pthread_mutex_t alu_mutex; + struct timeval last_stat_fetch; + uint32_t refresh_interval; /* in seconds */ + uint32_t refresh_create_count; /* num-file-create */ + + int32_t sched_nodes_pending; + + int32_t sched_index; /* used for round robin scheduling in case of many nodes getting the criteria match. */ + int32_t child_count; + +}; + +struct _alu_local_t { + int32_t call_count; +}; + +typedef struct _alu_local_t alu_local_t; + +#endif /* _ALU_H */ diff --git a/scheduler/nufa/Makefile.am b/scheduler/nufa/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/nufa/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/nufa/src/Makefile.am b/scheduler/nufa/src/Makefile.am new file mode 100644 index 000000000..6eb3d39f1 --- /dev/null +++ b/scheduler/nufa/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = nufa.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +nufa_la_LDFLAGS = -module -avoidversion + +nufa_la_SOURCES = nufa.c +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/nufa/src/nufa.c b/scheduler/nufa/src/nufa.c new file mode 100644 index 000000000..bbc61e2ad --- /dev/null +++ b/scheduler/nufa/src/nufa.c @@ -0,0 +1,403 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +#include "scheduler.h" +#include "common-utils.h" + +struct nufa_sched_struct { + xlator_t *xl; + struct timeval last_stat_fetch; + int64_t free_disk; + int32_t refresh_interval; + unsigned char eligible; +}; + +struct nufa_struct { + struct nufa_sched_struct *array; + struct timeval last_stat_fetch; + + int32_t *local_array; /* Used to keep the index of the local xlators */ + int32_t local_xl_index; /* index in the above array */ + int32_t local_xl_count; /* Count of the local subvolumes */ + + uint32_t refresh_interval; + uint32_t min_free_disk; + + gf_lock_t nufa_lock; + int32_t child_count; + int32_t sched_index; +}; + +#define NUFA_LIMITS_MIN_FREE_DISK_DEFAULT 15 +#define NUFA_REFRESH_INTERVAL_DEFAULT 30 + +static int32_t +nufa_init (xlator_t *xl) +{ + int32_t index = 0; + data_t *local_name = NULL; + data_t *data = NULL; + xlator_list_t *trav_xl = xl->children; + struct nufa_struct *nufa_buf = NULL; + + nufa_buf = CALLOC (1, sizeof (struct nufa_struct)); + ERR_ABORT (nufa_buf); + + data = dict_get (xl->options, "scheduler.limits.min-free-disk"); + if (data) { + if (gf_string2percent (data->data, + &nufa_buf->min_free_disk) != 0) { + gf_log ("nufa", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option scheduler.limits.min-free-disk\"", + data->data); + return -1; + } + if (nufa_buf->min_free_disk >= 100) { + gf_log ("nufa", GF_LOG_ERROR, + "check \"option scheduler.limits.min-free-disk" + "\", it should be percentage value"); + return -1; + } + } else { + gf_log ("nufa", GF_LOG_WARNING, + "No option for limit min-free-disk given, " + "defaulting it to 15%%"); + nufa_buf->min_free_disk = NUFA_LIMITS_MIN_FREE_DISK_DEFAULT; + } + data = dict_get (xl->options, "scheduler.refresh-interval"); + if (data && (gf_string2time (data->data, + &nufa_buf->refresh_interval) != 0)) { + gf_log ("nufa", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option scheduler.refresh-interval\"", + data->data); + return -1; + } else { + gf_log ("nufa", GF_LOG_WARNING, + "No option for scheduler.refresh-interval given, " + "defaulting it to 30"); + nufa_buf->refresh_interval = NUFA_REFRESH_INTERVAL_DEFAULT; + } + + /* Get the array built */ + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + nufa_buf->child_count = index; + nufa_buf->sched_index = 0; + nufa_buf->array = CALLOC (index, sizeof (struct nufa_sched_struct)); + ERR_ABORT (nufa_buf->array); + nufa_buf->local_array = CALLOC (index, sizeof (int32_t)); + ERR_ABORT (nufa_buf->array); + trav_xl = xl->children; + + local_name = dict_get (xl->options, "scheduler.local-volume-name"); + if (!local_name) { + /* Error */ + gf_log ("nufa", GF_LOG_ERROR, + "No 'local-volume-name' option given in volume file"); + FREE (nufa_buf->array); + FREE (nufa_buf->local_array); + FREE (nufa_buf); + return -1; + } + + /* Get the array properly */ + index = 0; + trav_xl = xl->children; + while (trav_xl) { + nufa_buf->array[index].xl = trav_xl->xlator; + nufa_buf->array[index].eligible = 1; + nufa_buf->array[index].free_disk = nufa_buf->min_free_disk; + nufa_buf->array[index].refresh_interval = + nufa_buf->refresh_interval; + + trav_xl = trav_xl->next; + index++; + } + + { + int32_t array_index = 0; + char *child = NULL; + char *tmp = NULL; + char *childs_data = strdup (local_name->data); + + child = strtok_r (childs_data, ",", &tmp); + while (child) { + /* Check if the local_volume specified is proper + subvolume of unify */ + trav_xl = xl->children; + index=0; + while (trav_xl) { + if (strcmp (child, trav_xl->xlator->name) == 0) + break; + trav_xl = trav_xl->next; + index++; + } + + if (!trav_xl) { + /* entry for 'local-volume-name' is wrong, + not present in subvolumes */ + gf_log ("nufa", GF_LOG_ERROR, + "option 'scheduler.local-volume-name' " + "%s is wrong", child); + FREE (nufa_buf->array); + FREE (nufa_buf->local_array); + FREE (nufa_buf); + return -1; + } else { + nufa_buf->local_array[array_index++] = index; + nufa_buf->local_xl_count++; + } + child = strtok_r (NULL, ",", &tmp); + } + free (childs_data); + } + + LOCK_INIT (&nufa_buf->nufa_lock); + *((long *)xl->private) = (long)nufa_buf; // put it at the proper place + return 0; +} + +static void +nufa_fini (xlator_t *xl) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + + LOCK_DESTROY (&nufa_buf->nufa_lock); + FREE (nufa_buf->local_array); + FREE (nufa_buf->array); + FREE (nufa_buf); +} + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + struct nufa_struct *nufa_struct = NULL; + int32_t idx = 0; + int32_t percent = 0; + + nufa_struct = (struct nufa_struct *)*((long *)xl->private); + LOCK (&nufa_struct->nufa_lock); + for (idx = 0; idx < nufa_struct->child_count; idx++) { + if (nufa_struct->array[idx].xl->name == (char *)cookie) + break; + } + UNLOCK (&nufa_struct->nufa_lock); + + if (op_ret == 0) { + percent = ((trav_stats->free_disk * 100) / + trav_stats->total_disk_size); + if (nufa_struct->array[idx].free_disk > percent) { + if (nufa_struct->array[idx].eligible) + gf_log ("nufa", GF_LOG_CRITICAL, + "node \"%s\" is _almost_ (%d %%) full", + nufa_struct->array[idx].xl->name, + 100 - percent); + nufa_struct->array[idx].eligible = 0; + } else { + nufa_struct->array[idx].eligible = 1; + } + } else { + nufa_struct->array[idx].eligible = 0; + } + + STACK_DESTROY (frame->root); + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + /* This function schedules the file in one of the child nodes */ + int32_t idx; + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + for (idx = 0; idx < nufa_buf->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + nufa_buf->array[idx].xl->name, + nufa_buf->array[idx].xl, + (nufa_buf->array[idx].xl)->mops->stats, + 0); //flag + } + + return; +} + +static void +nufa_update (xlator_t *xl) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + struct timeval tv; + gettimeofday (&tv, NULL); + if (tv.tv_sec > (nufa_buf->refresh_interval + + nufa_buf->last_stat_fetch.tv_sec)) { + /* Update the stats from all the server */ + update_stat_array (xl); + nufa_buf->last_stat_fetch.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +nufa_schedule (xlator_t *xl, const void *path) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + int32_t nufa_orig = nufa_buf->local_xl_index; + int32_t rr; + + nufa_update (xl); + + while (1) { + LOCK (&nufa_buf->nufa_lock); + rr = nufa_buf->local_xl_index++; + nufa_buf->local_xl_index %= nufa_buf->local_xl_count; + UNLOCK (&nufa_buf->nufa_lock); + + /* if 'eligible' or there are _no_ eligible nodes */ + if (nufa_buf->array[nufa_buf->local_array[rr]].eligible) { + /* Return the local node */ + return nufa_buf->array[nufa_buf->local_array[rr]].xl; + } + if ((rr + 1) % nufa_buf->local_xl_count == nufa_orig) { + gf_log ("nufa", GF_LOG_CRITICAL, + "No free space available on any local " + "volumes, using RR scheduler"); + LOCK (&nufa_buf->nufa_lock); + nufa_buf->local_xl_index++; + nufa_buf->local_xl_index %= nufa_buf->local_xl_count; + UNLOCK (&nufa_buf->nufa_lock); + break; + } + } + + nufa_orig = nufa_buf->sched_index; + while (1) { + LOCK (&nufa_buf->nufa_lock); + rr = nufa_buf->sched_index++; + nufa_buf->sched_index = (nufa_buf->sched_index % + nufa_buf->child_count); + UNLOCK (&nufa_buf->nufa_lock); + + /* if 'eligible' or there are _no_ eligible nodes */ + if (nufa_buf->array[rr].eligible) { + break; + } + if ((rr + 1) % nufa_buf->child_count == nufa_orig) { + gf_log ("nufa", GF_LOG_CRITICAL, + "No free space available on any server, " + "using RR scheduler."); + LOCK (&nufa_buf->nufa_lock); + nufa_buf->sched_index++; + nufa_buf->sched_index = (nufa_buf->sched_index % + nufa_buf->child_count); + UNLOCK (&nufa_buf->nufa_lock); + break; + } + } + return nufa_buf->array[rr].xl; +} + + +/** + * notify + */ +void +nufa_notify (xlator_t *xl, int32_t event, void *data) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + int32_t idx = 0; + + if (!nufa_buf) + return; + + for (idx = 0; idx < nufa_buf->child_count; idx++) { + if (strcmp (nufa_buf->array[idx].xl->name, + ((xlator_t *)data)->name) == 0) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //nufa_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + nufa_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = nufa_init, + .fini = nufa_fini, + .update = nufa_update, + .schedule = nufa_schedule, + .notify = nufa_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "nufa.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "nufa.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.local-volume-name", + "nufa.local-volume-name" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {NULL} } +}; + diff --git a/scheduler/random/Makefile.am b/scheduler/random/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/random/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/random/src/Makefile.am b/scheduler/random/src/Makefile.am new file mode 100644 index 000000000..572181336 --- /dev/null +++ b/scheduler/random/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = random.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +random_la_LDFLAGS = -module -avoidversion + +random_la_SOURCES = random.c +random_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = random.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/random/src/random.c b/scheduler/random/src/random.c new file mode 100644 index 000000000..9e761d08a --- /dev/null +++ b/scheduler/random/src/random.c @@ -0,0 +1,283 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "random.h" + +#define RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT 15 +#define RANDOM_REFRESH_INTERVAL_DEFAULT 10 + + +static int32_t +random_init (xlator_t *xl) +{ + struct random_struct *random_buf = NULL; + xlator_list_t *trav_xl = xl->children; + data_t *limit = NULL; + int32_t index = 0; + + random_buf = CALLOC (1, sizeof (struct random_struct)); + ERR_ABORT (random_buf); + + /* Set the seed for the 'random' function */ + srandom ((uint32_t) time (NULL)); + + limit = dict_get (xl->options, "scheduler.limits.min-free-disk"); + if (limit) { + if (gf_string2percent (data_to_str (limit), + &random_buf->min_free_disk) != 0) { + gf_log ("random", GF_LOG_ERROR, + "invalid number format \"%s\" of \"option " + "scheduler.limits.min-free-disk\"", + limit->data); + return -1; + } + if (random_buf->min_free_disk >= 100) { + gf_log ("random", GF_LOG_ERROR, + "check the \"option random.limits.min-free" + "-disk\", it should be percentage value"); + return -1; + } + + } else { + gf_log ("random", GF_LOG_WARNING, + "No option for limit min-free-disk given, " + "defaulting it to 5%%"); + random_buf->min_free_disk = + RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT; + } + + limit = dict_get (xl->options, "scheduler.refresh-interval"); + if (limit) { + if (gf_string2time (data_to_str (limit), + &random_buf->refresh_interval) != 0) { + gf_log ("random", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option random.refresh-interval\"", + limit->data); + return -1; + } + } else { + random_buf->refresh_interval = RANDOM_REFRESH_INTERVAL_DEFAULT; + } + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + random_buf->child_count = index; + random_buf->array = CALLOC (index, + sizeof (struct random_sched_struct)); + ERR_ABORT (random_buf->array); + trav_xl = xl->children; + index = 0; + + while (trav_xl) { + random_buf->array[index].xl = trav_xl->xlator; + random_buf->array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + pthread_mutex_init (&random_buf->random_mutex, NULL); + + // put it at the proper place + *((long *)xl->private) = (long)random_buf; + return 0; +} + +static void +random_fini (xlator_t *xl) +{ + struct random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + pthread_mutex_destroy (&random_buf->random_mutex); + free (random_buf->array); + free (random_buf); +} + + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + int32_t idx = 0; + int32_t percent = 0; + struct random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + + pthread_mutex_lock (&random_buf->random_mutex); + for (idx = 0; idx < random_buf->child_count; idx++) { + if (strcmp (random_buf->array[idx].xl->name, + (char *)cookie) == 0) + break; + } + pthread_mutex_unlock (&random_buf->random_mutex); + + if (op_ret == 0) { + percent = ((trav_stats->free_disk *100) / + trav_stats->total_disk_size); + if (random_buf->min_free_disk > percent) { + random_buf->array[idx].eligible = 0; + } else { + random_buf->array[idx].eligible = 1; + } + } else { + random_buf->array[idx].eligible = 0; + } + + STACK_DESTROY (frame->root); + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + int32_t idx; + struct random_struct *random_buf = NULL; + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + random_buf = (struct random_struct *)*((long *)xl->private); + for (idx = 0; idx < random_buf->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + random_buf->array[idx].xl->name, + random_buf->array[idx].xl, + (random_buf->array[idx].xl)->mops->stats, + 0); + } + return ; +} + +static void +random_update (xlator_t *xl) +{ + struct timeval tv; + struct random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + + gettimeofday(&tv, NULL); + if (tv.tv_sec > (random_buf->refresh_interval + + random_buf->last_stat_entry.tv_sec)) { + update_stat_array (xl); + random_buf->last_stat_entry.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +random_schedule (xlator_t *xl, const void *path) +{ + struct random_struct *random_buf = NULL; + int32_t rand = 0; + int32_t try = 0; + + random_buf = (struct random_struct *)*((long *)xl->private); + + rand = (int32_t) (1.0*random_buf->child_count * + (random() / (RAND_MAX + 1.0))); + + random_update (xl); + + while (!random_buf->array[rand].eligible) { + if (try++ > 100) { + /* there is a chance of this becoming a + infinite loop otherwise. */ + break; + } + rand = (int32_t) (1.0*random_buf->child_count * + (random() / (RAND_MAX + 1.0))); + } + return random_buf->array[rand].xl; +} + + +/** + * notify + */ +void +random_notify (xlator_t *xl, int32_t event, void *data) +{ + struct random_struct *random_buf = NULL; + int32_t idx = 0; + + random_buf = (struct random_struct *)*((long *)xl->private); + if (!random_buf) + return; + + for (idx = 0; idx < random_buf->child_count; idx++) { + if (random_buf->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //random_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + random_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = random_init, + .fini = random_fini, + .update = random_update, + .schedule = random_schedule, + .notify = random_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "random.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "random.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = {NULL} } +}; diff --git a/scheduler/random/src/random.h b/scheduler/random/src/random.h new file mode 100644 index 000000000..35c9e02ee --- /dev/null +++ b/scheduler/random/src/random.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _RANDOM_H +#define _RANDOM_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include +#include "scheduler.h" + +struct random_sched_struct { + xlator_t *xl; + unsigned char eligible; +}; + +struct random_struct { + int32_t child_count; + uint32_t refresh_interval; + uint32_t min_free_disk; + struct timeval last_stat_entry; + struct random_sched_struct *array; + pthread_mutex_t random_mutex; +}; + +#endif /* _RANDOM_H */ diff --git a/scheduler/rr/Makefile.am b/scheduler/rr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/rr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/rr/src/Makefile.am b/scheduler/rr/src/Makefile.am new file mode 100644 index 000000000..7e911c0ed --- /dev/null +++ b/scheduler/rr/src/Makefile.am @@ -0,0 +1,13 @@ +sched_LTLIBRARIES = rr.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +rr_la_LDFLAGS = -module -avoidversion + +rr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +rr_la_SOURCES = rr.c rr-options.c +noinst_HEADERS = rr.h rr-options.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/rr/src/rr-options.c b/scheduler/rr/src/rr-options.c new file mode 100644 index 000000000..3f0ffcaf2 --- /dev/null +++ b/scheduler/rr/src/rr-options.c @@ -0,0 +1,256 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include "scheduler.h" +#include "rr-options.h" + +#define RR_LIMITS_MIN_FREE_DISK_OPTION_STRING "scheduler.limits.min-free-disk" +#define RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT 15 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MIN 0 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MAX 100 + +#define RR_REFRESH_INTERVAL_OPTION_STRING "scheduler.refresh-interval" +#define RR_REFRESH_INTERVAL_VALUE_DEFAULT 10 + +#define RR_READ_ONLY_SUBVOLUMES_OPTION_STRING "scheduler.read-only-subvolumes" + +#define LOG_ERROR(args...) gf_log ("rr-options", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...) gf_log ("rr-options", GF_LOG_WARNING, ##args) + +static int +_rr_options_min_free_disk_validate (const char *value_string, uint32_t *n) +{ + uint32_t value = 0; + + if (value_string == NULL) + { + return -1; + } + + if (gf_string2percent (value_string, &value) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid number format [%s] of option [%s]", + value_string, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); + return -1; + } + + if ((value <= RR_LIMITS_MIN_FREE_DISK_VALUE_MIN) || + (value >= RR_LIMITS_MIN_FREE_DISK_VALUE_MAX)) + { + gf_log ("rr", + GF_LOG_ERROR, + "out of range [%d] of option [%s]. Allowed range is 0 to 100.", + value, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); + return -1; + } + + *n = value; + + return 0; +} + +static int +_rr_options_refresh_interval_validate (const char *value_string, uint32_t *n) +{ + uint32_t value = 0; + + if (value_string == NULL) + { + return -1; + } + + if (gf_string2time (value_string, &value) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid number format [%s] of option [%s]", + value_string, + RR_REFRESH_INTERVAL_OPTION_STRING); + return -1; + } + + *n = value; + + return 0; +} + +static int +_rr_options_read_only_subvolumes_validate (const char *value_string, + char ***volume_list, + uint64_t *volume_count) +{ + char **vlist = NULL; + int vcount = 0; + int i = 0; + + if (value_string == NULL || volume_list == NULL || volume_count) + { + return -1; + } + + if (gf_strsplit (value_string, + ", ", + &vlist, + &vcount) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid subvolume list [%s] of option [%s]", + value_string, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); + return -1; + } + + for (i = 0; i < vcount; i++) + { + if (gf_volume_name_validate (vlist[i]) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid subvolume name [%s] in [%s] of option [%s]", + vlist[i], + value_string, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); + goto free_exit; + } + } + + *volume_list = vlist; + *volume_count = vcount; + + return 0; + + free_exit: + for (i = 0; i < vcount; i++) + { + free (vlist[i]); + } + free (vlist); + + return -1; +} + +int +rr_options_validate (dict_t *options, rr_options_t *rr_options) +{ + char *value_string = NULL; + + if (options == NULL || rr_options == NULL) + { + return -1; + } + + if (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)) + if (data_to_str (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING))) + value_string = data_to_str (dict_get (options, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_min_free_disk_validate (value_string, + &rr_options->min_free_disk) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = %d", + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, + rr_options->min_free_disk); + } + else + { + rr_options->min_free_disk = RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT; + + gf_log ("rr", GF_LOG_DEBUG, + "using %s = %d [default]", + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, + rr_options->min_free_disk); + } + + value_string = NULL; + if (dict_get (options, RR_REFRESH_INTERVAL_OPTION_STRING)) + value_string = data_to_str (dict_get (options, + RR_REFRESH_INTERVAL_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_refresh_interval_validate (value_string, + &rr_options->refresh_interval) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = %d", + RR_REFRESH_INTERVAL_OPTION_STRING, + rr_options->refresh_interval); + } + else + { + rr_options->refresh_interval = RR_REFRESH_INTERVAL_VALUE_DEFAULT; + + gf_log ("rr", GF_LOG_DEBUG, + "using %s = %d [default]", + RR_REFRESH_INTERVAL_OPTION_STRING, + rr_options->refresh_interval); + } + + value_string = NULL; + if (dict_get (options, RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)) + value_string = data_to_str (dict_get (options, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_read_only_subvolumes_validate (value_string, + &rr_options->read_only_subvolume_list, + &rr_options->read_only_subvolume_count) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = [%s]", + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING, + value_string); + } + + return 0; +} + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "rr.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "rr.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.read-only-subvolumes", + "rr.read-only-subvolumes" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; diff --git a/scheduler/rr/src/rr-options.h b/scheduler/rr/src/rr-options.h new file mode 100644 index 000000000..4818c7d49 --- /dev/null +++ b/scheduler/rr/src/rr-options.h @@ -0,0 +1,34 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _RR_OPTIONS_H +#define _RR_OPTIONS_H + +struct rr_options +{ + uint32_t min_free_disk; + uint32_t refresh_interval; + char **read_only_subvolume_list; + uint64_t read_only_subvolume_count; +}; +typedef struct rr_options rr_options_t; + +int rr_options_validate (dict_t *options, rr_options_t *rr_options); + +#endif diff --git a/scheduler/rr/src/rr.c b/scheduler/rr/src/rr.c new file mode 100644 index 000000000..3e54ff5e1 --- /dev/null +++ b/scheduler/rr/src/rr.c @@ -0,0 +1,565 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include + +#include "scheduler.h" + +#include "rr-options.h" +#include "rr.h" + +#define RR_MIN_FREE_DISK_NOT_REACHED 0 +#define RR_MIN_FREE_DISK_REACHED 1 + +#define RR_SUBVOLUME_OFFLINE 0 +#define RR_SUBVOLUME_ONLINE 1 + +#define LOG_ERROR(args...) gf_log ("rr", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...) gf_log ("rr", GF_LOG_WARNING, ##args) +#define LOG_CRITICAL(args...) gf_log ("rr", GF_LOG_CRITICAL, ##args) + +#define ROUND_ROBIN(index, count) ((index + 1) % count) + +static int +_cleanup_rr (rr_t *rr) +{ + int i; + + if (rr == NULL) + { + return -1; + } + + if (rr->options.read_only_subvolume_list != NULL) + { + for (i = 0; i < rr->options.read_only_subvolume_count; i++) + { + free (rr->options.read_only_subvolume_list[i]); + } + free (rr->options.read_only_subvolume_list); + } + + free (rr->subvolume_list); + + free (rr); + + return 0; +} + +int +rr_init (xlator_t *this_xl) +{ + rr_t *rr = NULL; + dict_t *options = NULL; + xlator_list_t *children = NULL; + uint64_t children_count = 0; + int i = 0; + int j = 0; + + if (this_xl == NULL) + { + return -1; + } + + if ((options = this_xl->options) == NULL) + { + return -1; + } + + if ((children = this_xl->children) == NULL) + { + return -1; + } + + if ((rr = CALLOC (1, sizeof (rr_t))) == NULL) + { + return -1; + } + + if (rr_options_validate (options, &rr->options) != 0) + { + free (rr); + return -1; + } + + for (i = 0; i < rr->options.read_only_subvolume_count; i++) + { + char found = 0; + + for (children = this_xl->children; + children != NULL; + children = children->next) + { + if (strcmp (rr->options.read_only_subvolume_list[i], + children->xlator->name) == 0) + { + found = 1; + break; + } + } + + if (!found) + { + LOG_ERROR ("read-only subvolume [%s] not found in volume list", + rr->options.read_only_subvolume_list[i]); + _cleanup_rr (rr); + return -1; + } + } + + for (children = this_xl->children; + children != NULL; + children = children->next) + { + children_count++; + } + + /* bala: excluding read_only_subvolumes */ + if ((rr->subvolume_count = children_count - + rr->options.read_only_subvolume_count) == 0) + { + LOG_ERROR ("no writable volumes found for scheduling"); + _cleanup_rr (rr); + return -1; + } + + if ((rr->subvolume_list = CALLOC (rr->subvolume_count, + sizeof (rr_subvolume_t))) == NULL) + { + _cleanup_rr (rr); + return -1; + } + + i = 0; + j = 0; + for (children = this_xl->children; + children != NULL; + children = children->next) + { + char found = 0; + + for (j = 0; j < rr->options.read_only_subvolume_count; j++) + { + if (strcmp (rr->options.read_only_subvolume_list[i], + children->xlator->name) == 0) + { + found = 1; + break; + } + } + + if (!found) + { + rr_subvolume_t *subvolume = NULL; + + subvolume = &rr->subvolume_list[i]; + + subvolume->xl = children->xlator; + subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; + subvolume->status = RR_SUBVOLUME_ONLINE; + + i++; + } + } + + rr->schedule_index = UINT64_MAX; + rr->last_stat_fetched_time.tv_sec = 0; + rr->last_stat_fetched_time.tv_usec = 0; + pthread_mutex_init (&rr->mutex, NULL); + + *((long *)this_xl->private) = (long)rr; + + return 0; +} + +void +rr_fini (xlator_t *this_xl) +{ + rr_t *rr = NULL; + + if (this_xl == NULL) + { + return; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) != NULL) + { + pthread_mutex_destroy (&rr->mutex); + _cleanup_rr (rr); + this_xl->private = NULL; + } + + return; +} + +xlator_t * +rr_schedule (xlator_t *this_xl, const void *path) +{ + rr_t *rr = NULL; + uint64_t next_schedule_index = 0; + int i = 0; + + if (this_xl == NULL || path == NULL) + { + return NULL; + } + + rr = (rr_t *) *((long *)this_xl->private); + next_schedule_index = ROUND_ROBIN (rr->schedule_index, + rr->subvolume_count); + + rr_update (this_xl); + + for (i = next_schedule_index; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && + rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + for (i = 0; i < next_schedule_index; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && + rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + for (i = next_schedule_index; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + for (i = 0; i < next_schedule_index; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + return NULL; +} + +void +rr_update (xlator_t *this_xl) +{ + rr_t *rr = NULL; + struct timeval ctime = {0, 0}; + int i = 0; + + if (this_xl == NULL) + { + return ; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + return ; + } + + if (gettimeofday (&ctime, NULL) != 0) + { + return ; + } + + if (ctime.tv_sec > (rr->options.refresh_interval + + rr->last_stat_fetched_time.tv_sec)) + { + pthread_mutex_lock (&rr->mutex); + rr->last_stat_fetched_time = ctime; + pthread_mutex_unlock (&rr->mutex); + + for (i = 0; i < rr->subvolume_count; i++) + { + xlator_t *subvolume_xl = NULL; + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + + subvolume_xl = rr->subvolume_list[i].xl; + + pool = this_xl->ctx->pool; + + frame = create_frame (this_xl, pool); + + STACK_WIND_COOKIE (frame, + rr_update_cbk, + subvolume_xl->name, + subvolume_xl, + subvolume_xl->mops->stats, + 0); + } + } + + return ; +} + +int +rr_update_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + rr_t *rr = NULL; + rr_subvolume_t *subvolume = NULL; + uint8_t free_disk_percent = 0; + int i = 0; + + if (frame == NULL) + { + return -1; + } + + if (cookie == NULL || this_xl == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + if (op_ret == 0 && stats == NULL) + { + LOG_CRITICAL ("fatal! op_ret is 0 and stats is NULL. " + "Please report this to "); + STACK_DESTROY (frame->root); + return -1; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + for (i = 0; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].xl->name == (char *) cookie) + { + subvolume = &rr->subvolume_list[i]; + break; + } + } + + if (subvolume == NULL) + { + LOG_ERROR ("unknown cookie [%s]", (char *) cookie); + STACK_DESTROY (frame->root); + return -1; + } + + if (op_ret == 0) + { + free_disk_percent = (stats->free_disk * 100) / stats->total_disk_size; + if (free_disk_percent > rr->options.min_free_disk) + { + if (subvolume->free_disk_status != RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; + pthread_mutex_unlock (&rr->mutex); + LOG_WARNING ("subvolume [%s] is available with free space for scheduling", + subvolume->xl->name); + } + } + else + { + if (subvolume->free_disk_status != RR_MIN_FREE_DISK_REACHED) + { + pthread_mutex_lock (&rr->mutex); + subvolume->free_disk_status = RR_MIN_FREE_DISK_REACHED; + pthread_mutex_unlock (&rr->mutex); + LOG_WARNING ("subvolume [%s] reached minimum disk space requirement", + subvolume->xl->name); + } + } + } + else + { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_OFFLINE; + pthread_mutex_unlock (&rr->mutex); + LOG_ERROR ("unable to get subvolume [%s] status information and " + "scheduling is disabled", + subvolume->xl->name); + } + + STACK_DESTROY (frame->root); + return 0; +} + +void +rr_notify (xlator_t *this_xl, int32_t event, void *data) +{ + rr_t *rr = NULL; + rr_subvolume_t *subvolume = NULL; + xlator_t *subvolume_xl = NULL; + int i = 0, ret = 0; + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + dict_t *xattr = get_new_dict (); + int32_t version[1] = {1}; + + if (this_xl == NULL || data == NULL) { + return ; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) { + return ; + } + + subvolume_xl = (xlator_t *) data; + + for (i = 0; i < rr->subvolume_count; i++) { + if (rr->subvolume_list[i].xl == subvolume_xl) { + subvolume = &rr->subvolume_list[i]; + break; + } + } + + switch (event) { + case GF_EVENT_CHILD_UP: + /* Seeding, to be done only once */ + if (rr->first_time && (i == rr->subvolume_count)) { + loc_t loc = {0,}; + xlator_t *trav = NULL; + + pool = this_xl->ctx->pool; + frame = create_frame (this_xl, pool); + ret = dict_set_bin (xattr, "trusted.glusterfs.scheduler.rr", + version, sizeof (int32_t)); + if (-1 == ret) { + gf_log (this_xl->name, GF_LOG_ERROR, "rr seed setting failed"); + } + if (xattr) + dict_ref (xattr); + + loc.path = strdup ("/"); + for (trav = this_xl->parents->xlator; trav; trav = trav->parents->xlator) { + if (trav->itable) { + loc.inode = trav->itable->root; + break; + } + } + STACK_WIND (frame, + rr_notify_cbk, + (xlator_t *)data, + ((xlator_t *)data)->fops->xattrop, + &loc, + GF_XATTROP_ADD_ARRAY, + xattr); + + if (xattr) + dict_unref (xattr); + + rr->first_time = 0; + } + if (subvolume) { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_ONLINE; + pthread_mutex_unlock (&rr->mutex); + } + break; + case GF_EVENT_CHILD_DOWN: + if (subvolume) { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_OFFLINE; + pthread_mutex_unlock (&rr->mutex); + } + break; + } + + return ; +} + +int +rr_notify_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr) +{ + rr_t *rr = NULL; + int32_t *index = NULL; + int32_t ret = -1; + void *tmp_index_ptr = NULL; + + if (frame == NULL) + { + return -1; + } + + if ((this_xl == NULL) || (op_ret == -1)) + { + STACK_DESTROY (frame->root); + return -1; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + ret = dict_get_bin (xattr, "trusted.glusterfs.scheduler.rr", &tmp_index_ptr); + index = tmp_index_ptr; + if (ret == 0) + rr->schedule_index = (index[0] % rr->subvolume_count); + else + rr->schedule_index = 0; + + STACK_DESTROY (frame->root); + return 0; +} + +struct sched_ops sched = { + .init = rr_init, + .fini = rr_fini, + .update = rr_update, + .schedule = rr_schedule, + .notify = rr_notify +}; + diff --git a/scheduler/rr/src/rr.h b/scheduler/rr/src/rr.h new file mode 100644 index 000000000..baa471209 --- /dev/null +++ b/scheduler/rr/src/rr.h @@ -0,0 +1,70 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _RR_H +#define _RR_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include +#include + +struct rr_subvolume +{ + xlator_t *xl; + uint8_t free_disk_status; + uint8_t status; +}; +typedef struct rr_subvolume rr_subvolume_t; + +struct rr +{ + rr_options_t options; + rr_subvolume_t *subvolume_list; + uint64_t subvolume_count; + uint64_t schedule_index; + struct timeval last_stat_fetched_time; + pthread_mutex_t mutex; + char first_time; +}; +typedef struct rr rr_t; + +int rr_init (xlator_t *this_xl); +void rr_fini (xlator_t *this_xl); +xlator_t *rr_schedule (xlator_t *this_xl, const void *path); +void rr_update (xlator_t *this_xl); +int rr_update_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats); +void rr_notify (xlator_t *this_xl, int32_t event, void *data); +int rr_notify_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr); + +#endif /* _RR_H */ diff --git a/scheduler/switch/Makefile.am b/scheduler/switch/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/switch/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/switch/src/Makefile.am b/scheduler/switch/src/Makefile.am new file mode 100644 index 000000000..dc7d16d40 --- /dev/null +++ b/scheduler/switch/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = switch.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +switch_la_LDFLAGS = -module -avoidversion + +switch_la_SOURCES = switch.c +switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/switch/src/switch.c b/scheduler/switch/src/switch.c new file mode 100644 index 000000000..70b307187 --- /dev/null +++ b/scheduler/switch/src/switch.c @@ -0,0 +1,398 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "scheduler.h" + +struct switch_sched_array { + xlator_t *xl; + int32_t eligible; + int32_t considered; +}; + +/* Select one of this struct based on the path's pattern match */ +struct switch_sched_struct { + struct switch_sched_struct *next; + struct switch_sched_array *array; + char path_pattern[256]; + int32_t node_index; /* Index of the node in + this pattern. */ + int32_t num_child; /* Total num of child nodes + with this pattern. */ +}; + +struct switch_struct { + struct switch_sched_struct *cond; + struct switch_sched_array *array; + pthread_mutex_t switch_mutex; + int32_t child_count; +}; + +/* This function should return child node as '*:subvolumes' is inserterd */ +static xlator_t * +switch_get_matching_xl (const char *path, struct switch_sched_struct *cond) +{ + struct switch_sched_struct *trav = cond; + char *pathname = strdup (path); + int index = 0; + + while (trav) { + if (fnmatch (trav->path_pattern, + pathname, FNM_NOESCAPE) == 0) { + free (pathname); + trav->node_index %= trav->num_child; + index = (trav->node_index++) % trav->num_child; + return trav->array[index].xl; + } + trav = trav->next; + } + free (pathname); + return NULL; +} + + +static int32_t +switch_init (xlator_t *xl) +{ + int32_t index = 0; + data_t *data = NULL; + char *child = NULL; + char *tmp = NULL; + char *childs_data = NULL; + xlator_list_t *trav_xl = xl->children; + struct switch_struct *switch_buf = NULL; + + switch_buf = CALLOC (1, sizeof (struct switch_struct)); + ERR_ABORT (switch_buf); + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + switch_buf->child_count = index; + switch_buf->array = CALLOC (index + 1, + sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_buf->array); + trav_xl = xl->children; + index = 0; + + while (trav_xl) { + switch_buf->array[index].xl = trav_xl->xlator; + switch_buf->array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + + data = dict_get (xl->options, "scheduler.read-only-subvolumes"); + if (data) { + childs_data = strdup (data->data); + child = strtok_r (childs_data, ",", &tmp); + while (child) { + for (index = 1; + index < switch_buf->child_count; index++) { + if (strcmp (switch_buf->array[index - 1].xl->name, child) == 0) { + gf_log ("switch", GF_LOG_DEBUG, + "Child '%s' is read-only", + child); + memcpy (&(switch_buf->array[index-1]), + &(switch_buf->array[switch_buf->child_count - 1]), + sizeof (struct switch_sched_array)); + switch_buf->child_count--; + break; + } + } + child = strtok_r (NULL, ",", &tmp); + } + free (childs_data); + } + + data = dict_get (xl->options, "scheduler.local-volume-name"); + if (data) { + /* Means, give preference to that node first */ + gf_log ("switch", GF_LOG_DEBUG, + "local volume defined as %s", data->data); + + /* TODO: parse it properly, have an extra index to + specify that first */ + } + + /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ + data = dict_get (xl->options, "scheduler.switch.case"); + if (data) { + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *switch_str = NULL; + char *pattern = NULL; + char *childs = NULL; + struct switch_sched_struct *switch_opt = NULL; + struct switch_sched_struct *trav = NULL; + /* Get the pattern for considering switch case. + "option block-size *avi:10MB" etc */ + switch_str = strtok_r (data->data, ";", &tmp_str); + while (switch_str) { + dup_str = strdup (switch_str); + switch_opt = + CALLOC (1, + sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_opt); + + /* Link it to the main structure */ + if (switch_buf->cond) { + /* there are already few entries */ + trav = switch_buf->cond; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf->cond = switch_opt; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + childs = strtok_r (NULL, ":", &tmp_str1); + if (strncmp (pattern, "*", 2) == 0) { + gf_log ("switch", GF_LOG_WARNING, + "'*' pattern will be taken by default " + "for all the unconfigured child nodes," + " hence neglecting current option"); + switch_str = strtok_r (NULL, ";", &tmp_str); + free (dup_str); + continue; + } + memcpy (switch_opt->path_pattern, + pattern, strlen (pattern)); + if (childs) { + int32_t idx = 0; + char *tmp1 = NULL; + char *dup_childs = NULL; + /* TODO: get the list of child nodes for + the given pattern */ + dup_childs = strdup (childs); + child = strtok_r (dup_childs, ",", &tmp); + while (child) { + idx++; + child = strtok_r (NULL, ",", &tmp); + } + free (dup_childs); + child = strtok_r (childs, ",", &tmp1); + switch_opt->num_child = idx; + switch_opt->array = + CALLOC (1, idx * sizeof (struct switch_sched_array)); + ERR_ABORT (switch_opt->array); + idx = 0; + child = strtok_r (childs, ",", &tmp); + while (child) { + for (index = 1; + index < switch_buf->child_count; + index++) { + if (strcmp (switch_buf->array[index - 1].xl->name, + child) == 0) { + gf_log ("switch", + GF_LOG_DEBUG, + "'%s' pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, child); + /* + if (switch_buf->array[index-1].considered) { + gf_log ("switch", GF_LOG_DEBUG, + "ambiguity found, exiting"); + return -1; + } + */ + switch_opt->array[idx].xl = switch_buf->array[index-1].xl; + switch_buf->array[index-1].considered = 1; + idx++; + break; + } + } + child = strtok_r (NULL, ",", &tmp1); + } + } else { + /* error */ + gf_log ("switch", GF_LOG_ERROR, + "Check \"scheduler.switch.case\" " + "option in unify volume. Exiting"); + free (switch_buf->array); + free (switch_buf); + return -1; + } + free (dup_str); + switch_str = strtok_r (NULL, ";", &tmp_str); + } + } + /* Now, all the pattern based considerations done, so for all the + * remaining pattern, '*' to all the remaining child nodes + */ + { + struct switch_sched_struct *switch_opt = NULL; + int32_t flag = 0; + int32_t index = 0; + for (index=0; index < switch_buf->child_count; index++) { + /* check for considered flag */ + if (switch_buf->array[index].considered) + continue; + flag++; + } + if (!flag) { + gf_log ("switch", GF_LOG_ERROR, + "No nodes left for pattern '*'. Exiting."); + return -1; + } + switch_opt = CALLOC (1, sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_opt); + if (switch_buf->cond) { + /* there are already few entries */ + struct switch_sched_struct *trav = switch_buf->cond; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf->cond = switch_opt; + } + /* Add the '*' pattern to the array */ + memcpy (switch_opt->path_pattern, "*", 2); + switch_opt->num_child = flag; + switch_opt->array = + CALLOC (1, flag * sizeof (struct switch_sched_array)); + ERR_ABORT (switch_opt->array); + flag = 0; + for (index=0; index < switch_buf->child_count; index++) { + /* check for considered flag */ + if (switch_buf->array[index].considered) + continue; + gf_log ("switch", GF_LOG_DEBUG, + "'%s' pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, + switch_buf->array[index].xl->name); + switch_opt->array[flag].xl = + switch_buf->array[index].xl; + switch_buf->array[index].considered = 1; + flag++; + } + } + + pthread_mutex_init (&switch_buf->switch_mutex, NULL); + + // put it at the proper place + *((long *)xl->private) = (long)switch_buf; + + return 0; +} + +static void +switch_fini (xlator_t *xl) +{ + /* TODO: free all the allocated entries */ + struct switch_struct *switch_buf = NULL; + switch_buf = (struct switch_struct *)*((long *)xl->private); + + pthread_mutex_destroy (&switch_buf->switch_mutex); + free (switch_buf->array); + free (switch_buf); +} + +static xlator_t * +switch_schedule (xlator_t *xl, const void *path) +{ + struct switch_struct *switch_buf = NULL; + switch_buf = (struct switch_struct *)*((long *)xl->private); + + return switch_get_matching_xl (path, switch_buf->cond); +} + + +/** + * notify + */ +void +switch_notify (xlator_t *xl, int32_t event, void *data) +{ + /* TODO: This should be checking in switch_sched_struct */ +#if 0 + struct switch_struct *switch_buf = NULL; + int32_t idx = 0; + + switch_buf = (struct switch_struct *)*((long *)xl->private); + if (!switch_buf) + return; + + for (idx = 0; idx < switch_buf->child_count; idx++) { + if (switch_buf->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + switch_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + switch_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } +#endif +} + +static void +switch_update (xlator_t *xl) +{ + return; +} + +struct sched_ops sched = { + .init = switch_init, + .fini = switch_fini, + .update = switch_update, + .schedule = switch_schedule, + .notify = switch_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.read-only-subvolumes" , + "switch.read-only-subvolumes"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.local-volume-name", + "switch.nufa.local-volume-name" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = { "scheduler.switch.case", + "switch.case" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; -- cgit