summaryrefslogtreecommitdiffstats
path: root/scheduler
diff options
context:
space:
mode:
Diffstat (limited to 'scheduler')
-rw-r--r--scheduler/Makefile.am3
-rw-r--r--scheduler/alu/Makefile.am3
-rw-r--r--scheduler/alu/src/Makefile.am14
-rw-r--r--scheduler/alu/src/alu.c993
-rw-r--r--scheduler/alu/src/alu.h89
-rw-r--r--scheduler/nufa/Makefile.am3
-rw-r--r--scheduler/nufa/src/Makefile.am12
-rw-r--r--scheduler/nufa/src/nufa.c403
-rw-r--r--scheduler/random/Makefile.am3
-rw-r--r--scheduler/random/src/Makefile.am14
-rw-r--r--scheduler/random/src/random.c283
-rw-r--r--scheduler/random/src/random.h46
-rw-r--r--scheduler/rr/Makefile.am3
-rw-r--r--scheduler/rr/src/Makefile.am13
-rw-r--r--scheduler/rr/src/rr-options.c256
-rw-r--r--scheduler/rr/src/rr-options.h34
-rw-r--r--scheduler/rr/src/rr.c565
-rw-r--r--scheduler/rr/src/rr.h70
-rw-r--r--scheduler/switch/Makefile.am3
-rw-r--r--scheduler/switch/src/Makefile.am12
-rw-r--r--scheduler/switch/src/switch.c398
21 files changed, 3220 insertions, 0 deletions
diff --git a/scheduler/Makefile.am b/scheduler/Makefile.am
new file mode 100644
index 00000000000..618fa7dd8b8
--- /dev/null
+++ b/scheduler/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = alu random nufa rr switch
+
+CLEANFILES =
diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/scheduler/alu/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am
new file mode 100644
index 00000000000..eb7d0db07e0
--- /dev/null
+++ b/scheduler/alu/src/Makefile.am
@@ -0,0 +1,14 @@
+sched_LTLIBRARIES = alu.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+alu_la_LDFLAGS = -module -avoidversion
+
+alu_la_SOURCES = alu.c
+alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = alu.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c
new file mode 100644
index 00000000000..754c5e35352
--- /dev/null
+++ b/scheduler/alu/src/alu.c
@@ -0,0 +1,993 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/* ALU code needs a complete re-write. This is one of the most important
+ * part of GlusterFS and so needs more and more reviews and testing
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+#include <stdint.h>
+#include "stack.h"
+#include "alu.h"
+
+#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT (1 * GF_UNIT_GB)
+#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT (512 * GF_UNIT_MB)
+
+#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT 25
+#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT 5
+
+#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT 25
+#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT 5
+
+#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT 1000
+#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT 100
+
+#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT 100
+
+#define ALU_REFRESH_INTERVAL_DEFAULT 5
+#define ALU_REFRESH_CREATE_COUNT_DEFAULT 5
+
+
+static int64_t
+get_stats_disk_usage (struct xlator_stats *this)
+{
+ return this->disk_usage;
+}
+
+static int64_t
+get_stats_write_usage (struct xlator_stats *this)
+{
+ return this->write_usage;
+}
+
+static int64_t
+get_stats_read_usage (struct xlator_stats *this)
+{
+ return this->read_usage;
+}
+
+static int64_t
+get_stats_disk_speed (struct xlator_stats *this)
+{
+ return this->disk_speed;
+}
+
+static int64_t
+get_stats_file_usage (struct xlator_stats *this)
+{
+ /* Avoid warning "defined but not used" */
+ (void) &get_stats_file_usage;
+
+ return this->nr_files;
+}
+
+static int64_t
+get_stats_free_disk (struct xlator_stats *this)
+{
+ if (this->total_disk_size > 0)
+ return (this->free_disk * 100) / this->total_disk_size;
+ return 0;
+}
+
+static int64_t
+get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+ return (max->write_usage - min->write_usage);
+}
+
+static int64_t
+get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+ return (max->read_usage - min->read_usage);
+}
+
+static int64_t
+get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+ return (max->disk_usage - min->disk_usage);
+}
+
+static int64_t
+get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min)
+{
+ return (max->disk_speed - min->disk_speed);
+}
+
+static int64_t
+get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+ return (max->nr_files - min->nr_files);
+}
+
+
+int
+alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched)
+{
+ data_t *order = dict_get (xl->options, "scheduler.alu.order");
+ if (!order) {
+ gf_log (xl->name, GF_LOG_ERROR,
+ "option 'scheduler.alu.order' not specified");
+ return -1;
+ }
+ struct alu_threshold *_threshold_fn;
+ struct alu_threshold *tmp_threshold;
+ data_t *entry_fn = NULL;
+ data_t *exit_fn = NULL;
+ char *tmp_str = NULL;
+ char *order_str = strtok_r (order->data, ":", &tmp_str);
+ /* Get the scheduling priority order, specified by the user. */
+ while (order_str) {
+ gf_log ("alu", GF_LOG_DEBUG,
+ "alu_init: order string: %s",
+ order_str);
+ if (strcmp (order_str, "disk-usage") == 0) {
+ /* Disk usage */
+ _threshold_fn =
+ CALLOC (1, sizeof (struct alu_threshold));
+ ERR_ABORT (_threshold_fn);
+ _threshold_fn->diff_value = get_max_diff_disk_usage;
+ _threshold_fn->sched_value = get_stats_disk_usage;
+ entry_fn =
+ dict_get (xl->options,
+ "scheduler.alu.disk-usage.entry-threshold");
+ if (entry_fn) {
+ if (gf_string2bytesize (entry_fn->data,
+ &alu_sched->entry_limit.disk_usage) != 0) {
+ gf_log (xl->name, GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "disk-usage.entry-threshold\"",
+ entry_fn->data);
+ return -1;
+ }
+ } else {
+ alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->entry_value = get_stats_disk_usage;
+ exit_fn = dict_get (xl->options,
+ "scheduler.alu.disk-usage.exit-threshold");
+ if (exit_fn) {
+ if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0) {
+ gf_log (xl->name, GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "disk-usage.exit-threshold\"",
+ exit_fn->data);
+ return -1;
+ }
+ } else {
+ alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->exit_value = get_stats_disk_usage;
+ tmp_threshold = alu_sched->threshold_fn;
+ if (!tmp_threshold) {
+ alu_sched->threshold_fn = _threshold_fn;
+ } else {
+ while (tmp_threshold->next) {
+ tmp_threshold = tmp_threshold->next;
+ }
+ tmp_threshold->next = _threshold_fn;
+ }
+ gf_log ("alu",
+ GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"",
+ alu_sched->entry_limit.disk_usage,
+ alu_sched->exit_limit.disk_usage);
+
+ } else if (strcmp (order_str, "write-usage") == 0) {
+ /* Handle "write-usage" */
+
+ _threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+ ERR_ABORT (_threshold_fn);
+ _threshold_fn->diff_value = get_max_diff_write_usage;
+ _threshold_fn->sched_value = get_stats_write_usage;
+ entry_fn = dict_get (xl->options,
+ "scheduler.alu.write-usage.entry-threshold");
+ if (entry_fn) {
+ if (gf_string2bytesize (entry_fn->data,
+ &alu_sched->entry_limit.write_usage) != 0) {
+ gf_log (xl->name, GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of option scheduler.alu."
+ "write-usage.entry-threshold",
+ entry_fn->data);
+ return -1;
+ }
+ } else {
+ alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->entry_value = get_stats_write_usage;
+ exit_fn = dict_get (xl->options,
+ "scheduler.alu.write-usage.exit-threshold");
+ if (exit_fn) {
+ if (gf_string2bytesize (exit_fn->data,
+ &alu_sched->exit_limit.write_usage) != 0) {
+ gf_log (xl->name, GF_LOG_ERROR,
+ "invalid number format \"%s\""
+ " of \"option scheduler.alu."
+ "write-usage.exit-threshold\"",
+ exit_fn->data);
+ return -1;
+ }
+ } else {
+ alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->exit_value = get_stats_write_usage;
+ tmp_threshold = alu_sched->threshold_fn;
+ if (!tmp_threshold) {
+ alu_sched->threshold_fn = _threshold_fn;
+ } else {
+ while (tmp_threshold->next) {
+ tmp_threshold = tmp_threshold->next;
+ }
+ tmp_threshold->next = _threshold_fn;
+ }
+ gf_log (xl->name, GF_LOG_DEBUG,
+ "alu_init: = %"PRId64",%"PRId64"",
+ alu_sched->entry_limit.write_usage,
+ alu_sched->exit_limit.write_usage);
+
+ } else if (strcmp (order_str, "read-usage") == 0) {
+ /* Read usage */
+
+ _threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+ ERR_ABORT (_threshold_fn);
+ _threshold_fn->diff_value = get_max_diff_read_usage;
+ _threshold_fn->sched_value = get_stats_read_usage;
+ entry_fn = dict_get (xl->options,
+ "scheduler.alu.read-usage.entry-threshold");
+ if (entry_fn) {
+ if (gf_string2bytesize (entry_fn->data,
+ &alu_sched->entry_limit.read_usage) != 0) {
+ gf_log (xl->name,
+ GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "read-usage.entry-threshold\"",
+ entry_fn->data);
+ return -1;
+ }
+ } else {
+ alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->entry_value = get_stats_read_usage;
+ exit_fn = dict_get (xl->options,
+ "scheduler.alu.read-usage.exit-threshold");
+ if (exit_fn)
+ {
+ if (gf_string2bytesize (exit_fn->data,
+ &alu_sched->exit_limit.read_usage) != 0)
+ {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "read-usage.exit-threshold\"",
+ exit_fn->data);
+ return -1;
+ }
+ }
+ else
+ {
+ alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->exit_value = get_stats_read_usage;
+ tmp_threshold = alu_sched->threshold_fn;
+ if (!tmp_threshold) {
+ alu_sched->threshold_fn = _threshold_fn;
+ }
+ else {
+ while (tmp_threshold->next) {
+ tmp_threshold = tmp_threshold->next;
+ }
+ tmp_threshold->next = _threshold_fn;
+ }
+ gf_log ("alu", GF_LOG_DEBUG,
+ "alu_init: = %"PRId64",%"PRId64"",
+ alu_sched->entry_limit.read_usage,
+ alu_sched->exit_limit.read_usage);
+
+ } else if (strcmp (order_str, "open-files-usage") == 0) {
+ /* Open files counter */
+
+ _threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+ ERR_ABORT (_threshold_fn);
+ _threshold_fn->diff_value = get_max_diff_file_usage;
+ _threshold_fn->sched_value = get_stats_file_usage;
+ entry_fn = dict_get (xl->options,
+ "scheduler.alu.open-files-usage.entry-threshold");
+ if (entry_fn) {
+ if (gf_string2uint64 (entry_fn->data,
+ &alu_sched->entry_limit.nr_files) != 0)
+ {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "open-files-usage.entry-"
+ "threshold\"", entry_fn->data);
+ return -1;
+ }
+ }
+ else
+ {
+ alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->entry_value = get_stats_file_usage;
+ exit_fn = dict_get (xl->options,
+ "scheduler.alu.open-files-usage.exit-threshold");
+ if (exit_fn)
+ {
+ if (gf_string2uint64 (exit_fn->data,
+ &alu_sched->exit_limit.nr_files) != 0)
+ {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.alu."
+ "open-files-usage.exit-"
+ "threshold\"", exit_fn->data);
+ return -1;
+ }
+ }
+ else
+ {
+ alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT;
+ }
+ _threshold_fn->exit_value = get_stats_file_usage;
+ tmp_threshold = alu_sched->threshold_fn;
+ if (!tmp_threshold) {
+ alu_sched->threshold_fn = _threshold_fn;
+ }
+ else {
+ while (tmp_threshold->next) {
+ tmp_threshold = tmp_threshold->next;
+ }
+ tmp_threshold->next = _threshold_fn;
+ }
+ gf_log ("alu", GF_LOG_DEBUG,
+ "alu.c->alu_init: = %"PRIu64",%"PRIu64"",
+ alu_sched->entry_limit.nr_files,
+ alu_sched->exit_limit.nr_files);
+
+ } else if (strcmp (order_str, "disk-speed-usage") == 0) {
+ /* Disk speed */
+
+ _threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+ ERR_ABORT (_threshold_fn);
+ _threshold_fn->diff_value = get_max_diff_disk_speed;
+ _threshold_fn->sched_value = get_stats_disk_speed;
+ entry_fn = dict_get (xl->options,
+ "scheduler.alu.disk-speed-usage.entry-threshold");
+ if (entry_fn) {
+ gf_log ("alu", GF_LOG_DEBUG,
+ "entry-threshold is given, "
+ "value is constant");
+ }
+ _threshold_fn->entry_value = NULL;
+ exit_fn = dict_get (xl->options,
+ "scheduler.alu.disk-speed-usage.exit-threshold");
+ if (exit_fn) {
+ gf_log ("alu", GF_LOG_DEBUG,
+ "exit-threshold is given, "
+ "value is constant");
+ }
+ _threshold_fn->exit_value = NULL;
+ tmp_threshold = alu_sched->threshold_fn;
+ if (!tmp_threshold) {
+ alu_sched->threshold_fn = _threshold_fn;
+ }
+ else {
+ while (tmp_threshold->next) {
+ tmp_threshold = tmp_threshold->next;
+ }
+ tmp_threshold->next = _threshold_fn;
+ }
+
+ } else {
+ gf_log ("alu", GF_LOG_DEBUG,
+ "%s, unknown option provided to scheduler",
+ order_str);
+ }
+ order_str = strtok_r (NULL, ":", &tmp_str);
+ }
+
+ return 0;
+}
+
+static int32_t
+alu_init (xlator_t *xl)
+{
+ struct alu_sched *alu_sched = NULL;
+ struct alu_limits *_limit_fn = NULL;
+ struct alu_limits *tmp_limits = NULL;
+ uint32_t min_free_disk = 0;
+ data_t *limits = NULL;
+
+ alu_sched = CALLOC (1, sizeof (struct alu_sched));
+ ERR_ABORT (alu_sched);
+
+ {
+ alu_parse_options (xl, alu_sched);
+ }
+
+ /* Get the limits */
+
+ limits = dict_get (xl->options,
+ "scheduler.limits.min-free-disk");
+ if (limits) {
+ _limit_fn = CALLOC (1, sizeof (struct alu_limits));
+ ERR_ABORT (_limit_fn);
+ _limit_fn->min_value = get_stats_free_disk;
+ _limit_fn->cur_value = get_stats_free_disk;
+ tmp_limits = alu_sched->limits_fn ;
+ _limit_fn->next = tmp_limits;
+ alu_sched->limits_fn = _limit_fn;
+
+ if (gf_string2percent (limits->data,
+ &min_free_disk) != 0) {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" "
+ "of \"option scheduler.limits."
+ "min-free-disk\"", limits->data);
+ return -1;
+ }
+ alu_sched->spec_limit.free_disk = min_free_disk;
+
+ if (alu_sched->spec_limit.free_disk >= 100) {
+ gf_log ("alu", GF_LOG_ERROR,
+ "check the \"option scheduler."
+ "limits.min-free-disk\", it should "
+ "be percentage value");
+ return -1;
+ }
+ alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */
+ gf_log ("alu", GF_LOG_DEBUG,
+ "alu.limit.min-disk-free = %"PRId64"",
+ _limit_fn->cur_value (&(alu_sched->spec_limit)));
+ }
+
+ limits = dict_get (xl->options,
+ "scheduler.limits.max-open-files");
+ if (limits) {
+ // Update alu_sched->priority properly
+ _limit_fn = CALLOC (1, sizeof (struct alu_limits));
+ ERR_ABORT (_limit_fn);
+ _limit_fn->max_value = get_stats_file_usage;
+ _limit_fn->cur_value = get_stats_file_usage;
+ tmp_limits = alu_sched->limits_fn ;
+ _limit_fn->next = tmp_limits;
+ alu_sched->limits_fn = _limit_fn;
+ if (gf_string2uint64_base10 (limits->data,
+ &alu_sched->spec_limit.nr_files) != 0)
+ {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format '%s' of option "
+ "scheduler.limits.max-open-files",
+ limits->data);
+ return -1;
+ }
+
+ gf_log ("alu", GF_LOG_DEBUG,
+ "alu_init: limit.max-open-files = %"PRId64"",
+ _limit_fn->cur_value (&(alu_sched->spec_limit)));
+ }
+
+
+ /* Stats refresh options */
+ limits = dict_get (xl->options,
+ "scheduler.refresh-interval");
+ if (limits) {
+ if (gf_string2time (limits->data,
+ &alu_sched->refresh_interval) != 0) {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" of "
+ "option scheduler.refresh-interval",
+ limits->data);
+ return -1;
+ }
+ } else {
+ alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT;
+ }
+ gettimeofday (&(alu_sched->last_stat_fetch), NULL);
+
+
+ limits = dict_get (xl->options,
+ "scheduler.alu.stat-refresh.num-file-create");
+ if (limits) {
+ if (gf_string2uint32 (limits->data,
+ &alu_sched->refresh_create_count) != 0)
+ {
+ gf_log ("alu", GF_LOG_ERROR,
+ "invalid number format \"%s\" of \"option "
+ "alu.stat-refresh.num-file-create\"",
+ limits->data);
+ return -1;
+ }
+ } else {
+ alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT;
+ }
+
+ {
+ /* Build an array of child_nodes */
+ struct alu_sched_struct *sched_array = NULL;
+ xlator_list_t *trav_xl = xl->children;
+ data_t *data = NULL;
+ int32_t index = 0;
+
+ while (trav_xl) {
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ alu_sched->child_count = index;
+ sched_array = CALLOC (index, sizeof (struct alu_sched_struct));
+ ERR_ABORT (sched_array);
+ trav_xl = xl->children;
+ index = 0;
+ while (trav_xl) {
+ sched_array[index].xl = trav_xl->xlator;
+ sched_array[index].eligible = 1;
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ alu_sched->array = sched_array;
+
+ data = dict_get (xl->options,
+ "scheduler.read-only-subvolumes");
+ if (data) {
+ char *child = NULL;
+ char *tmp = NULL;
+ char *childs_data = strdup (data->data);
+
+ child = strtok_r (childs_data, ",", &tmp);
+ while (child) {
+ for (index = 1; index < alu_sched->child_count; index++) {
+ if (strcmp (alu_sched->array[index -1].xl->name, child) == 0) {
+ memcpy (&(alu_sched->array[index -1]),
+ &(alu_sched->array[alu_sched->child_count -1]),
+ sizeof (struct alu_sched_struct));
+ alu_sched->child_count--;
+ break;
+ }
+ }
+ child = strtok_r (NULL, ",", &tmp);
+ }
+ }
+ }
+
+ *((long *)xl->private) = (long)alu_sched;
+
+ /* Initialize all the alu_sched structure's elements */
+ {
+ alu_sched->sched_nodes_pending = 0;
+
+ alu_sched->min_limit.free_disk = 0x00FFFFFF;
+ alu_sched->min_limit.disk_usage = 0xFFFFFFFF;
+ alu_sched->min_limit.total_disk_size = 0xFFFFFFFF;
+ alu_sched->min_limit.disk_speed = 0xFFFFFFFF;
+ alu_sched->min_limit.write_usage = 0xFFFFFFFF;
+ alu_sched->min_limit.read_usage = 0xFFFFFFFF;
+ alu_sched->min_limit.nr_files = 0xFFFFFFFF;
+ alu_sched->min_limit.nr_clients = 0xFFFFFFFF;
+ }
+
+ pthread_mutex_init (&alu_sched->alu_mutex, NULL);
+ return 0;
+}
+
+static void
+alu_fini (xlator_t *xl)
+{
+ if (!xl)
+ return;
+ struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+ struct alu_limits *limit = alu_sched->limits_fn;
+ struct alu_threshold *threshold = alu_sched->threshold_fn;
+ void *tmp = NULL;
+ pthread_mutex_destroy (&alu_sched->alu_mutex);
+ free (alu_sched->array);
+ while (limit) {
+ tmp = limit;
+ limit = limit->next;
+ free (tmp);
+ }
+ while (threshold) {
+ tmp = threshold;
+ threshold = threshold->next;
+ free (tmp);
+ }
+ free (alu_sched);
+}
+
+static int32_t
+update_stat_array_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *trav_stats)
+{
+ struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+ struct alu_limits *limits_fn = alu_sched->limits_fn;
+ int32_t idx = 0;
+
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ for (idx = 0; idx < alu_sched->child_count; idx++) {
+ if (alu_sched->array[idx].xl == (xlator_t *)cookie)
+ break;
+ }
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+
+ if (op_ret == -1) {
+ alu_sched->array[idx].eligible = 0;
+ } else {
+ memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats));
+
+ /* Get stats from all the child node */
+ /* Here check the limits specified by the user to
+ consider the nodes to be used by scheduler */
+ alu_sched->array[idx].eligible = 1;
+ limits_fn = alu_sched->limits_fn;
+ while (limits_fn){
+ if (limits_fn->max_value &&
+ (limits_fn->cur_value (trav_stats) >
+ limits_fn->max_value (&(alu_sched->spec_limit)))) {
+ alu_sched->array[idx].eligible = 0;
+ }
+ if (limits_fn->min_value &&
+ (limits_fn->cur_value (trav_stats) <
+ limits_fn->min_value (&(alu_sched->spec_limit)))) {
+ alu_sched->array[idx].eligible = 0;
+ }
+ limits_fn = limits_fn->next;
+ }
+
+ /* Select minimum and maximum disk_usage */
+ if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) {
+ alu_sched->max_limit.disk_usage = trav_stats->disk_usage;
+ }
+ if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) {
+ alu_sched->min_limit.disk_usage = trav_stats->disk_usage;
+ }
+
+ /* Select minimum and maximum disk_speed */
+ if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) {
+ alu_sched->max_limit.disk_speed = trav_stats->disk_speed;
+ }
+ if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) {
+ alu_sched->min_limit.disk_speed = trav_stats->disk_speed;
+ }
+
+ /* Select minimum and maximum number of open files */
+ if (trav_stats->nr_files > alu_sched->max_limit.nr_files) {
+ alu_sched->max_limit.nr_files = trav_stats->nr_files;
+ }
+ if (trav_stats->nr_files < alu_sched->min_limit.nr_files) {
+ alu_sched->min_limit.nr_files = trav_stats->nr_files;
+ }
+
+ /* Select minimum and maximum write-usage */
+ if (trav_stats->write_usage > alu_sched->max_limit.write_usage) {
+ alu_sched->max_limit.write_usage = trav_stats->write_usage;
+ }
+ if (trav_stats->write_usage < alu_sched->min_limit.write_usage) {
+ alu_sched->min_limit.write_usage = trav_stats->write_usage;
+ }
+
+ /* Select minimum and maximum read-usage */
+ if (trav_stats->read_usage > alu_sched->max_limit.read_usage) {
+ alu_sched->max_limit.read_usage = trav_stats->read_usage;
+ }
+ if (trav_stats->read_usage < alu_sched->min_limit.read_usage) {
+ alu_sched->min_limit.read_usage = trav_stats->read_usage;
+ }
+
+ /* Select minimum and maximum free-disk */
+ if (trav_stats->free_disk > alu_sched->max_limit.free_disk) {
+ alu_sched->max_limit.free_disk = trav_stats->free_disk;
+ }
+ if (trav_stats->free_disk < alu_sched->min_limit.free_disk) {
+ alu_sched->min_limit.free_disk = trav_stats->free_disk;
+ }
+ }
+
+ STACK_DESTROY (frame->root);
+
+ return 0;
+}
+
+static void
+update_stat_array (xlator_t *xl)
+{
+ /* This function schedules the file in one of the child nodes */
+ struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+ int32_t idx = 0;
+ call_frame_t *frame = NULL;
+ call_pool_t *pool = xl->ctx->pool;
+
+ for (idx = 0 ; idx < alu_sched->child_count; idx++) {
+ frame = create_frame (xl, pool);
+
+ STACK_WIND_COOKIE (frame,
+ update_stat_array_cbk,
+ alu_sched->array[idx].xl, //cookie
+ alu_sched->array[idx].xl,
+ (alu_sched->array[idx].xl)->mops->stats,
+ 0); //flag
+ }
+ return;
+}
+
+static void
+alu_update (xlator_t *xl)
+{
+ struct timeval tv;
+ struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+
+ gettimeofday (&tv, NULL);
+ if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) {
+ /* Update the stats from all the server */
+ update_stat_array (xl);
+ alu_sched->last_stat_fetch.tv_sec = tv.tv_sec;
+ }
+}
+
+static xlator_t *
+alu_scheduler (xlator_t *xl, const void *path)
+{
+ /* This function schedules the file in one of the child nodes */
+ struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+ int32_t sched_index = 0;
+ int32_t sched_index_orig = 0;
+ int32_t idx = 0;
+
+ alu_update (xl);
+
+ /* Now check each threshold one by one if some nodes are classified */
+ {
+ struct alu_threshold *trav_threshold = alu_sched->threshold_fn;
+ struct alu_threshold *tmp_threshold = alu_sched->sched_method;
+ struct alu_sched_node *tmp_sched_node;
+
+ /* This pointer 'trav_threshold' contains function pointers according to spec file
+ give by user, */
+ while (trav_threshold) {
+ /* This check is needed for seeing if already there are nodes in this criteria
+ to be scheduled */
+ if (!alu_sched->sched_nodes_pending) {
+ for (idx = 0; idx < alu_sched->child_count; idx++) {
+ if (!alu_sched->array[idx].eligible) {
+ continue;
+ }
+ if (trav_threshold->entry_value) {
+ if (trav_threshold->diff_value (&(alu_sched->max_limit),
+ &(alu_sched->array[idx].stats)) <
+ trav_threshold->entry_value (&(alu_sched->entry_limit))) {
+ continue;
+ }
+ }
+ tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node));
+ ERR_ABORT (tmp_sched_node);
+ tmp_sched_node->index = idx;
+ if (!alu_sched->sched_node) {
+ alu_sched->sched_node = tmp_sched_node;
+ } else {
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ tmp_sched_node->next = alu_sched->sched_node;
+ alu_sched->sched_node = tmp_sched_node;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ }
+ alu_sched->sched_nodes_pending++;
+ }
+ } /* end of if (sched_nodes_pending) */
+
+ /* This loop is required to check the eligible nodes */
+ struct alu_sched_node *trav_sched_node;
+ while (alu_sched->sched_nodes_pending) {
+ trav_sched_node = alu_sched->sched_node;
+ sched_index = trav_sched_node->index;
+ if (alu_sched->array[sched_index].eligible)
+ break;
+ alu_sched->sched_node = trav_sched_node->next;
+ free (trav_sched_node);
+ alu_sched->sched_nodes_pending--;
+ }
+ if (alu_sched->sched_nodes_pending) {
+ /* There are some node in this criteria to be scheduled, no need
+ * to sort and check other methods
+ */
+ if (tmp_threshold && tmp_threshold->exit_value) {
+ /* verify the exit value && whether node is eligible or not */
+ if (tmp_threshold->diff_value (&(alu_sched->max_limit),
+ &(alu_sched->array[sched_index].stats)) >
+ tmp_threshold->exit_value (&(alu_sched->exit_limit))) {
+ /* Free the allocated info for the node :) */
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ alu_sched->sched_node = trav_sched_node->next;
+ free (trav_sched_node);
+ trav_sched_node = alu_sched->sched_node;
+ alu_sched->sched_nodes_pending--;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ }
+ } else {
+ /* if there is no exit value, then exit after scheduling once */
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ alu_sched->sched_node = trav_sched_node->next;
+ free (trav_sched_node);
+ trav_sched_node = alu_sched->sched_node;
+ alu_sched->sched_nodes_pending--;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ }
+
+ alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */
+
+ /* */
+ if (trav_sched_node) {
+ tmp_sched_node = trav_sched_node;
+ while (trav_sched_node->next) {
+ trav_sched_node = trav_sched_node->next;
+ }
+ if (tmp_sched_node->next) {
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ alu_sched->sched_node = tmp_sched_node->next;
+ tmp_sched_node->next = NULL;
+ trav_sched_node->next = tmp_sched_node;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ }
+ }
+ /* return the scheduled node */
+ return alu_sched->array[sched_index].xl;
+ } /* end of if (pending_nodes) */
+
+ tmp_threshold = trav_threshold;
+ trav_threshold = trav_threshold->next;
+ }
+ }
+
+ /* This is used only when there is everything seems ok, or no eligible nodes */
+ sched_index_orig = alu_sched->sched_index;
+ alu_sched->sched_method = NULL;
+ while (1) {
+ //lock
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ sched_index = alu_sched->sched_index++;
+ alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ //unlock
+ if (alu_sched->array[sched_index].eligible)
+ break;
+ if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) {
+ gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule");
+ //lock
+ pthread_mutex_lock (&alu_sched->alu_mutex);
+ alu_sched->sched_index++;
+ alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count;
+ pthread_mutex_unlock (&alu_sched->alu_mutex);
+ //unlock
+ break;
+ }
+ }
+ return alu_sched->array[sched_index].xl;
+}
+
+/**
+ * notify
+ */
+void
+alu_notify (xlator_t *xl, int32_t event, void *data)
+{
+ struct alu_sched *alu_sched = NULL;
+ int32_t idx = 0;
+
+ alu_sched = (struct alu_sched *)*((long *)xl->private);
+ if (!alu_sched)
+ return;
+
+ for (idx = 0; idx < alu_sched->child_count; idx++) {
+ if (alu_sched->array[idx].xl == (xlator_t *)data)
+ break;
+ }
+
+ switch (event)
+ {
+ case GF_EVENT_CHILD_UP:
+ {
+ //alu_sched->array[idx].eligible = 1;
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ {
+ alu_sched->array[idx].eligible = 0;
+ }
+ break;
+ default:
+ {
+ ;
+ }
+ break;
+ }
+
+}
+
+struct sched_ops sched = {
+ .init = alu_init,
+ .fini = alu_fini,
+ .update = alu_update,
+ .schedule = alu_scheduler,
+ .notify = alu_notify
+};
+
+struct volume_options options[] = {
+ { .key = { "scheduler.alu.order", "alu.order" },
+ .type = GF_OPTION_TYPE_ANY
+ },
+ { .key = { "scheduler.alu.disk-usage.entry-threshold",
+ "alu.disk-usage.entry-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.disk-usage.exit-threshold",
+ "alu.disk-usage.exit-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.write-usage.entry-threshold",
+ "alu.write-usage.entry-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.write-usage.exit-threshold",
+ "alu.write-usage.exit-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.read-usage.entry-threshold",
+ "alu.read-usage.entry-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.read-usage.exit-threshold",
+ "alu.read-usage.exit-threshold" },
+ .type = GF_OPTION_TYPE_SIZET
+ },
+ { .key = { "scheduler.alu.open-files-usage.entry-threshold",
+ "alu.open-files-usage.entry-threshold" },
+ .type = GF_OPTION_TYPE_INT
+ },
+ { .key = { "scheduler.alu.open-files-usage.exit-threshold",
+ "alu.open-files-usage.exit-threshold" },
+ .type = GF_OPTION_TYPE_INT
+ },
+ { .key = { "scheduler.read-only-subvolumes",
+ "alu.read-only-subvolumes" },
+ .type = GF_OPTION_TYPE_ANY
+ },
+ { .key = { "scheduler.refresh-interval",
+ "alu.refresh-interval",
+ "alu.stat-refresh.interval" },
+ .type = GF_OPTION_TYPE_TIME
+ },
+ { .key = { "scheduler.limits.min-free-disk",
+ "alu.limits.min-free-disk" },
+ .type = GF_OPTION_TYPE_PERCENT
+ },
+ { .key = { "scheduler.alu.stat-refresh.num-file-create"
+ "alu.stat-refresh.num-file-create"},
+ .type = GF_OPTION_TYPE_INT
+ },
+ { .key = {NULL}, }
+};
diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h
new file mode 100644
index 00000000000..a958bb4d298
--- /dev/null
+++ b/scheduler/alu/src/alu.h
@@ -0,0 +1,89 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _ALU_H
+#define _ALU_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+
+struct alu_sched;
+
+struct alu_sched_struct {
+ xlator_t *xl;
+ struct xlator_stats stats;
+ unsigned char eligible;
+};
+
+// Write better name for these functions
+struct alu_limits {
+ struct alu_limits *next;
+ int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */
+ int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */
+ int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */
+};
+
+struct alu_threshold {
+ struct alu_threshold *next;
+ int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */
+ int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */
+ int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */
+ int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */
+};
+
+struct alu_sched_node {
+ struct alu_sched_node *next;
+ int32_t index;
+};
+
+struct alu_sched {
+ struct alu_limits *limits_fn;
+ struct alu_threshold *threshold_fn;
+ struct alu_sched_struct *array;
+ struct alu_sched_node *sched_node;
+ struct alu_threshold *sched_method;
+ struct xlator_stats max_limit;
+ struct xlator_stats min_limit;
+ struct xlator_stats entry_limit;
+ struct xlator_stats exit_limit;
+ struct xlator_stats spec_limit; /* User given limit */
+
+ pthread_mutex_t alu_mutex;
+ struct timeval last_stat_fetch;
+ uint32_t refresh_interval; /* in seconds */
+ uint32_t refresh_create_count; /* num-file-create */
+
+ int32_t sched_nodes_pending;
+
+ int32_t sched_index; /* used for round robin scheduling in case of many nodes getting the criteria match. */
+ int32_t child_count;
+
+};
+
+struct _alu_local_t {
+ int32_t call_count;
+};
+
+typedef struct _alu_local_t alu_local_t;
+
+#endif /* _ALU_H */
diff --git a/scheduler/nufa/Makefile.am b/scheduler/nufa/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/scheduler/nufa/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/scheduler/nufa/src/Makefile.am b/scheduler/nufa/src/Makefile.am
new file mode 100644
index 00000000000..6eb3d39f1e2
--- /dev/null
+++ b/scheduler/nufa/src/Makefile.am
@@ -0,0 +1,12 @@
+sched_LTLIBRARIES = nufa.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+nufa_la_LDFLAGS = -module -avoidversion
+
+nufa_la_SOURCES = nufa.c
+nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/scheduler/nufa/src/nufa.c b/scheduler/nufa/src/nufa.c
new file mode 100644
index 00000000000..bbc61e2ad6f
--- /dev/null
+++ b/scheduler/nufa/src/nufa.c
@@ -0,0 +1,403 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "common-utils.h"
+
+struct nufa_sched_struct {
+ xlator_t *xl;
+ struct timeval last_stat_fetch;
+ int64_t free_disk;
+ int32_t refresh_interval;
+ unsigned char eligible;
+};
+
+struct nufa_struct {
+ struct nufa_sched_struct *array;
+ struct timeval last_stat_fetch;
+
+ int32_t *local_array; /* Used to keep the index of the local xlators */
+ int32_t local_xl_index; /* index in the above array */
+ int32_t local_xl_count; /* Count of the local subvolumes */
+
+ uint32_t refresh_interval;
+ uint32_t min_free_disk;
+
+ gf_lock_t nufa_lock;
+ int32_t child_count;
+ int32_t sched_index;
+};
+
+#define NUFA_LIMITS_MIN_FREE_DISK_DEFAULT 15
+#define NUFA_REFRESH_INTERVAL_DEFAULT 30
+
+static int32_t
+nufa_init (xlator_t *xl)
+{
+ int32_t index = 0;
+ data_t *local_name = NULL;
+ data_t *data = NULL;
+ xlator_list_t *trav_xl = xl->children;
+ struct nufa_struct *nufa_buf = NULL;
+
+ nufa_buf = CALLOC (1, sizeof (struct nufa_struct));
+ ERR_ABORT (nufa_buf);
+
+ data = dict_get (xl->options, "scheduler.limits.min-free-disk");
+ if (data) {
+ if (gf_string2percent (data->data,
+ &nufa_buf->min_free_disk) != 0) {
+ gf_log ("nufa", GF_LOG_ERROR,
+ "invalid number format \"%s\" of "
+ "\"option scheduler.limits.min-free-disk\"",
+ data->data);
+ return -1;
+ }
+ if (nufa_buf->min_free_disk >= 100) {
+ gf_log ("nufa", GF_LOG_ERROR,
+ "check \"option scheduler.limits.min-free-disk"
+ "\", it should be percentage value");
+ return -1;
+ }
+ } else {
+ gf_log ("nufa", GF_LOG_WARNING,
+ "No option for limit min-free-disk given, "
+ "defaulting it to 15%%");
+ nufa_buf->min_free_disk = NUFA_LIMITS_MIN_FREE_DISK_DEFAULT;
+ }
+ data = dict_get (xl->options, "scheduler.refresh-interval");
+ if (data && (gf_string2time (data->data,
+ &nufa_buf->refresh_interval) != 0)) {
+ gf_log ("nufa", GF_LOG_ERROR,
+ "invalid number format \"%s\" of "
+ "\"option scheduler.refresh-interval\"",
+ data->data);
+ return -1;
+ } else {
+ gf_log ("nufa", GF_LOG_WARNING,
+ "No option for scheduler.refresh-interval given, "
+ "defaulting it to 30");
+ nufa_buf->refresh_interval = NUFA_REFRESH_INTERVAL_DEFAULT;
+ }
+
+ /* Get the array built */
+ while (trav_xl) {
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ nufa_buf->child_count = index;
+ nufa_buf->sched_index = 0;
+ nufa_buf->array = CALLOC (index, sizeof (struct nufa_sched_struct));
+ ERR_ABORT (nufa_buf->array);
+ nufa_buf->local_array = CALLOC (index, sizeof (int32_t));
+ ERR_ABORT (nufa_buf->array);
+ trav_xl = xl->children;
+
+ local_name = dict_get (xl->options, "scheduler.local-volume-name");
+ if (!local_name) {
+ /* Error */
+ gf_log ("nufa", GF_LOG_ERROR,
+ "No 'local-volume-name' option given in volume file");
+ FREE (nufa_buf->array);
+ FREE (nufa_buf->local_array);
+ FREE (nufa_buf);
+ return -1;
+ }
+
+ /* Get the array properly */
+ index = 0;
+ trav_xl = xl->children;
+ while (trav_xl) {
+ nufa_buf->array[index].xl = trav_xl->xlator;
+ nufa_buf->array[index].eligible = 1;
+ nufa_buf->array[index].free_disk = nufa_buf->min_free_disk;
+ nufa_buf->array[index].refresh_interval =
+ nufa_buf->refresh_interval;
+
+ trav_xl = trav_xl->next;
+ index++;
+ }
+
+ {
+ int32_t array_index = 0;
+ char *child = NULL;
+ char *tmp = NULL;
+ char *childs_data = strdup (local_name->data);
+
+ child = strtok_r (childs_data, ",", &tmp);
+ while (child) {
+ /* Check if the local_volume specified is proper
+ subvolume of unify */
+ trav_xl = xl->children;
+ index=0;
+ while (trav_xl) {
+ if (strcmp (child, trav_xl->xlator->name) == 0)
+ break;
+ trav_xl = trav_xl->next;
+ index++;
+ }
+
+ if (!trav_xl) {
+ /* entry for 'local-volume-name' is wrong,
+ not present in subvolumes */
+ gf_log ("nufa", GF_LOG_ERROR,
+ "option 'scheduler.local-volume-name' "
+ "%s is wrong", child);
+ FREE (nufa_buf->array);
+ FREE (nufa_buf->local_array);
+ FREE (nufa_buf);
+ return -1;
+ } else {
+ nufa_buf->local_array[array_index++] = index;
+ nufa_buf->local_xl_count++;
+ }
+ child = strtok_r (NULL, ",", &tmp);
+ }
+ free (childs_data);
+ }
+
+ LOCK_INIT (&nufa_buf->nufa_lock);
+ *((long *)xl->private) = (long)nufa_buf; // put it at the proper place
+ return 0;
+}
+
+static void
+nufa_fini (xlator_t *xl)
+{
+ struct nufa_struct *nufa_buf =
+ (struct nufa_struct *)*((long *)xl->private);
+
+ LOCK_DESTROY (&nufa_buf->nufa_lock);
+ FREE (nufa_buf->local_array);
+ FREE (nufa_buf->array);
+ FREE (nufa_buf);
+}
+
+static int32_t
+update_stat_array_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *trav_stats)
+{
+ struct nufa_struct *nufa_struct = NULL;
+ int32_t idx = 0;
+ int32_t percent = 0;
+
+ nufa_struct = (struct nufa_struct *)*((long *)xl->private);
+ LOCK (&nufa_struct->nufa_lock);
+ for (idx = 0; idx < nufa_struct->child_count; idx++) {
+ if (nufa_struct->array[idx].xl->name == (char *)cookie)
+ break;
+ }
+ UNLOCK (&nufa_struct->nufa_lock);
+
+ if (op_ret == 0) {
+ percent = ((trav_stats->free_disk * 100) /
+ trav_stats->total_disk_size);
+ if (nufa_struct->array[idx].free_disk > percent) {
+ if (nufa_struct->array[idx].eligible)
+ gf_log ("nufa", GF_LOG_CRITICAL,
+ "node \"%s\" is _almost_ (%d %%) full",
+ nufa_struct->array[idx].xl->name,
+ 100 - percent);
+ nufa_struct->array[idx].eligible = 0;
+ } else {
+ nufa_struct->array[idx].eligible = 1;
+ }
+ } else {
+ nufa_struct->array[idx].eligible = 0;
+ }
+
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+static void
+update_stat_array (xlator_t *xl)
+{
+ /* This function schedules the file in one of the child nodes */
+ int32_t idx;
+ struct nufa_struct *nufa_buf =
+ (struct nufa_struct *)*((long *)xl->private);
+ call_frame_t *frame = NULL;
+ call_pool_t *pool = xl->ctx->pool;
+
+ for (idx = 0; idx < nufa_buf->child_count; idx++) {
+ frame = create_frame (xl, pool);
+
+ STACK_WIND_COOKIE (frame,
+ update_stat_array_cbk,
+ nufa_buf->array[idx].xl->name,
+ nufa_buf->array[idx].xl,
+ (nufa_buf->array[idx].xl)->mops->stats,
+ 0); //flag
+ }
+
+ return;
+}
+
+static void
+nufa_update (xlator_t *xl)
+{
+ struct nufa_struct *nufa_buf =
+ (struct nufa_struct *)*((long *)xl->private);
+ struct timeval tv;
+ gettimeofday (&tv, NULL);
+ if (tv.tv_sec > (nufa_buf->refresh_interval
+ + nufa_buf->last_stat_fetch.tv_sec)) {
+ /* Update the stats from all the server */
+ update_stat_array (xl);
+ nufa_buf->last_stat_fetch.tv_sec = tv.tv_sec;
+ }
+}
+
+static xlator_t *
+nufa_schedule (xlator_t *xl, const void *path)
+{
+ struct nufa_struct *nufa_buf =
+ (struct nufa_struct *)*((long *)xl->private);
+ int32_t nufa_orig = nufa_buf->local_xl_index;
+ int32_t rr;
+
+ nufa_update (xl);
+
+ while (1) {
+ LOCK (&nufa_buf->nufa_lock);
+ rr = nufa_buf->local_xl_index++;
+ nufa_buf->local_xl_index %= nufa_buf->local_xl_count;
+ UNLOCK (&nufa_buf->nufa_lock);
+
+ /* if 'eligible' or there are _no_ eligible nodes */
+ if (nufa_buf->array[nufa_buf->local_array[rr]].eligible) {
+ /* Return the local node */
+ return nufa_buf->array[nufa_buf->local_array[rr]].xl;
+ }
+ if ((rr + 1) % nufa_buf->local_xl_count == nufa_orig) {
+ gf_log ("nufa", GF_LOG_CRITICAL,
+ "No free space available on any local "
+ "volumes, using RR scheduler");
+ LOCK (&nufa_buf->nufa_lock);
+ nufa_buf->local_xl_index++;
+ nufa_buf->local_xl_index %= nufa_buf->local_xl_count;
+ UNLOCK (&nufa_buf->nufa_lock);
+ break;
+ }
+ }
+
+ nufa_orig = nufa_buf->sched_index;
+ while (1) {
+ LOCK (&nufa_buf->nufa_lock);
+ rr = nufa_buf->sched_index++;
+ nufa_buf->sched_index = (nufa_buf->sched_index %
+ nufa_buf->child_count);
+ UNLOCK (&nufa_buf->nufa_lock);
+
+ /* if 'eligible' or there are _no_ eligible nodes */
+ if (nufa_buf->array[rr].eligible) {
+ break;
+ }
+ if ((rr + 1) % nufa_buf->child_count == nufa_orig) {
+ gf_log ("nufa", GF_LOG_CRITICAL,
+ "No free space available on any server, "
+ "using RR scheduler.");
+ LOCK (&nufa_buf->nufa_lock);
+ nufa_buf->sched_index++;
+ nufa_buf->sched_index = (nufa_buf->sched_index %
+ nufa_buf->child_count);
+ UNLOCK (&nufa_buf->nufa_lock);
+ break;
+ }
+ }
+ return nufa_buf->array[rr].xl;
+}
+
+
+/**
+ * notify
+ */
+void
+nufa_notify (xlator_t *xl, int32_t event, void *data)
+{
+ struct nufa_struct *nufa_buf =
+ (struct nufa_struct *)*((long *)xl->private);
+ int32_t idx = 0;
+
+ if (!nufa_buf)
+ return;
+
+ for (idx = 0; idx < nufa_buf->child_count; idx++) {
+ if (strcmp (nufa_buf->array[idx].xl->name,
+ ((xlator_t *)data)->name) == 0)
+ break;
+ }
+
+ switch (event)
+ {
+ case GF_EVENT_CHILD_UP:
+ {
+ //nufa_buf->array[idx].eligible = 1;
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ {
+ nufa_buf->array[idx].eligible = 0;
+ }
+ break;
+ default:
+ {
+ ;
+ }
+ break;
+ }
+
+}
+
+struct sched_ops sched = {
+ .init = nufa_init,
+ .fini = nufa_fini,
+ .update = nufa_update,
+ .schedule = nufa_schedule,
+ .notify = nufa_notify
+};
+
+struct volume_options options[] = {
+ { .key = { "scheduler.refresh-interval",
+ "nufa.refresh-interval" },
+ .type = GF_OPTION_TYPE_TIME
+ },
+ { .key = { "scheduler.limits.min-free-disk",
+ "nufa.limits.min-free-disk" },
+ .type = GF_OPTION_TYPE_PERCENT
+ },
+ { .key = { "scheduler.local-volume-name",
+ "nufa.local-volume-name" },
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = {NULL} }
+};
+
diff --git a/scheduler/random/Makefile.am b/scheduler/random/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/scheduler/random/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/scheduler/random/src/Makefile.am b/scheduler/random/src/Makefile.am
new file mode 100644
index 00000000000..572181336c2
--- /dev/null
+++ b/scheduler/random/src/Makefile.am
@@ -0,0 +1,14 @@
+sched_LTLIBRARIES = random.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+random_la_LDFLAGS = -module -avoidversion
+
+random_la_SOURCES = random.c
+random_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = random.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/scheduler/random/src/random.c b/scheduler/random/src/random.c
new file mode 100644
index 00000000000..9e761d08a54
--- /dev/null
+++ b/scheduler/random/src/random.c
@@ -0,0 +1,283 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdlib.h>
+#include <sys/time.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "random.h"
+
+#define RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT 15
+#define RANDOM_REFRESH_INTERVAL_DEFAULT 10
+
+
+static int32_t
+random_init (xlator_t *xl)
+{
+ struct random_struct *random_buf = NULL;
+ xlator_list_t *trav_xl = xl->children;
+ data_t *limit = NULL;
+ int32_t index = 0;
+
+ random_buf = CALLOC (1, sizeof (struct random_struct));
+ ERR_ABORT (random_buf);
+
+ /* Set the seed for the 'random' function */
+ srandom ((uint32_t) time (NULL));
+
+ limit = dict_get (xl->options, "scheduler.limits.min-free-disk");
+ if (limit) {
+ if (gf_string2percent (data_to_str (limit),
+ &random_buf->min_free_disk) != 0) {
+ gf_log ("random", GF_LOG_ERROR,
+ "invalid number format \"%s\" of \"option "
+ "scheduler.limits.min-free-disk\"",
+ limit->data);
+ return -1;
+ }
+ if (random_buf->min_free_disk >= 100) {
+ gf_log ("random", GF_LOG_ERROR,
+ "check the \"option random.limits.min-free"
+ "-disk\", it should be percentage value");
+ return -1;
+ }
+
+ } else {
+ gf_log ("random", GF_LOG_WARNING,
+ "No option for limit min-free-disk given, "
+ "defaulting it to 5%%");
+ random_buf->min_free_disk =
+ RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT;
+ }
+
+ limit = dict_get (xl->options, "scheduler.refresh-interval");
+ if (limit) {
+ if (gf_string2time (data_to_str (limit),
+ &random_buf->refresh_interval) != 0) {
+ gf_log ("random", GF_LOG_ERROR,
+ "invalid number format \"%s\" of "
+ "\"option random.refresh-interval\"",
+ limit->data);
+ return -1;
+ }
+ } else {
+ random_buf->refresh_interval = RANDOM_REFRESH_INTERVAL_DEFAULT;
+ }
+
+ while (trav_xl) {
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ random_buf->child_count = index;
+ random_buf->array = CALLOC (index,
+ sizeof (struct random_sched_struct));
+ ERR_ABORT (random_buf->array);
+ trav_xl = xl->children;
+ index = 0;
+
+ while (trav_xl) {
+ random_buf->array[index].xl = trav_xl->xlator;
+ random_buf->array[index].eligible = 1;
+ trav_xl = trav_xl->next;
+ index++;
+ }
+ pthread_mutex_init (&random_buf->random_mutex, NULL);
+
+ // put it at the proper place
+ *((long *)xl->private) = (long)random_buf;
+ return 0;
+}
+
+static void
+random_fini (xlator_t *xl)
+{
+ struct random_struct *random_buf = NULL;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+ pthread_mutex_destroy (&random_buf->random_mutex);
+ free (random_buf->array);
+ free (random_buf);
+}
+
+
+static int32_t
+update_stat_array_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *trav_stats)
+{
+ int32_t idx = 0;
+ int32_t percent = 0;
+ struct random_struct *random_buf = NULL;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+
+ pthread_mutex_lock (&random_buf->random_mutex);
+ for (idx = 0; idx < random_buf->child_count; idx++) {
+ if (strcmp (random_buf->array[idx].xl->name,
+ (char *)cookie) == 0)
+ break;
+ }
+ pthread_mutex_unlock (&random_buf->random_mutex);
+
+ if (op_ret == 0) {
+ percent = ((trav_stats->free_disk *100) /
+ trav_stats->total_disk_size);
+ if (random_buf->min_free_disk > percent) {
+ random_buf->array[idx].eligible = 0;
+ } else {
+ random_buf->array[idx].eligible = 1;
+ }
+ } else {
+ random_buf->array[idx].eligible = 0;
+ }
+
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+static void
+update_stat_array (xlator_t *xl)
+{
+ int32_t idx;
+ struct random_struct *random_buf = NULL;
+ call_frame_t *frame = NULL;
+ call_pool_t *pool = xl->ctx->pool;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+ for (idx = 0; idx < random_buf->child_count; idx++) {
+ frame = create_frame (xl, pool);
+
+ STACK_WIND_COOKIE (frame,
+ update_stat_array_cbk,
+ random_buf->array[idx].xl->name,
+ random_buf->array[idx].xl,
+ (random_buf->array[idx].xl)->mops->stats,
+ 0);
+ }
+ return ;
+}
+
+static void
+random_update (xlator_t *xl)
+{
+ struct timeval tv;
+ struct random_struct *random_buf = NULL;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+
+ gettimeofday(&tv, NULL);
+ if (tv.tv_sec > (random_buf->refresh_interval +
+ random_buf->last_stat_entry.tv_sec)) {
+ update_stat_array (xl);
+ random_buf->last_stat_entry.tv_sec = tv.tv_sec;
+ }
+}
+
+static xlator_t *
+random_schedule (xlator_t *xl, const void *path)
+{
+ struct random_struct *random_buf = NULL;
+ int32_t rand = 0;
+ int32_t try = 0;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+
+ rand = (int32_t) (1.0*random_buf->child_count *
+ (random() / (RAND_MAX + 1.0)));
+
+ random_update (xl);
+
+ while (!random_buf->array[rand].eligible) {
+ if (try++ > 100) {
+ /* there is a chance of this becoming a
+ infinite loop otherwise. */
+ break;
+ }
+ rand = (int32_t) (1.0*random_buf->child_count *
+ (random() / (RAND_MAX + 1.0)));
+ }
+ return random_buf->array[rand].xl;
+}
+
+
+/**
+ * notify
+ */
+void
+random_notify (xlator_t *xl, int32_t event, void *data)
+{
+ struct random_struct *random_buf = NULL;
+ int32_t idx = 0;
+
+ random_buf = (struct random_struct *)*((long *)xl->private);
+ if (!random_buf)
+ return;
+
+ for (idx = 0; idx < random_buf->child_count; idx++) {
+ if (random_buf->array[idx].xl == (xlator_t *)data)
+ break;
+ }
+
+ switch (event)
+ {
+ case GF_EVENT_CHILD_UP:
+ {
+ //random_buf->array[idx].eligible = 1;
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ {
+ random_buf->array[idx].eligible = 0;
+ }
+ break;
+ default:
+ {
+ ;
+ }
+ break;
+ }
+
+}
+
+struct sched_ops sched = {
+ .init = random_init,
+ .fini = random_fini,
+ .update = random_update,
+ .schedule = random_schedule,
+ .notify = random_notify
+};
+
+struct volume_options options[] = {
+ { .key = { "scheduler.refresh-interval",
+ "random.refresh-interval" },
+ .type = GF_OPTION_TYPE_TIME
+ },
+ { .key = { "scheduler.limits.min-free-disk",
+ "random.limits.min-free-disk" },
+ .type = GF_OPTION_TYPE_PERCENT
+ },
+ { .key = {NULL} }
+};
diff --git a/scheduler/random/src/random.h b/scheduler/random/src/random.h
new file mode 100644
index 00000000000..35c9e02ee2b
--- /dev/null
+++ b/scheduler/random/src/random.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RANDOM_H
+#define _RANDOM_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <sys/time.h>
+#include "scheduler.h"
+
+struct random_sched_struct {
+ xlator_t *xl;
+ unsigned char eligible;
+};
+
+struct random_struct {
+ int32_t child_count;
+ uint32_t refresh_interval;
+ uint32_t min_free_disk;
+ struct timeval last_stat_entry;
+ struct random_sched_struct *array;
+ pthread_mutex_t random_mutex;
+};
+
+#endif /* _RANDOM_H */
diff --git a/scheduler/rr/Makefile.am b/scheduler/rr/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/scheduler/rr/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/scheduler/rr/src/Makefile.am b/scheduler/rr/src/Makefile.am
new file mode 100644
index 00000000000..7e911c0eda8
--- /dev/null
+++ b/scheduler/rr/src/Makefile.am
@@ -0,0 +1,13 @@
+sched_LTLIBRARIES = rr.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+rr_la_LDFLAGS = -module -avoidversion
+
+rr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+rr_la_SOURCES = rr.c rr-options.c
+noinst_HEADERS = rr.h rr-options.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/scheduler/rr/src/rr-options.c b/scheduler/rr/src/rr-options.c
new file mode 100644
index 00000000000..3f0ffcaf2e9
--- /dev/null
+++ b/scheduler/rr/src/rr-options.c
@@ -0,0 +1,256 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include "scheduler.h"
+#include "rr-options.h"
+
+#define RR_LIMITS_MIN_FREE_DISK_OPTION_STRING "scheduler.limits.min-free-disk"
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT 15
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_MIN 0
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_MAX 100
+
+#define RR_REFRESH_INTERVAL_OPTION_STRING "scheduler.refresh-interval"
+#define RR_REFRESH_INTERVAL_VALUE_DEFAULT 10
+
+#define RR_READ_ONLY_SUBVOLUMES_OPTION_STRING "scheduler.read-only-subvolumes"
+
+#define LOG_ERROR(args...) gf_log ("rr-options", GF_LOG_ERROR, ##args)
+#define LOG_WARNING(args...) gf_log ("rr-options", GF_LOG_WARNING, ##args)
+
+static int
+_rr_options_min_free_disk_validate (const char *value_string, uint32_t *n)
+{
+ uint32_t value = 0;
+
+ if (value_string == NULL)
+ {
+ return -1;
+ }
+
+ if (gf_string2percent (value_string, &value) != 0)
+ {
+ gf_log ("rr",
+ GF_LOG_ERROR,
+ "invalid number format [%s] of option [%s]",
+ value_string,
+ RR_LIMITS_MIN_FREE_DISK_OPTION_STRING);
+ return -1;
+ }
+
+ if ((value <= RR_LIMITS_MIN_FREE_DISK_VALUE_MIN) ||
+ (value >= RR_LIMITS_MIN_FREE_DISK_VALUE_MAX))
+ {
+ gf_log ("rr",
+ GF_LOG_ERROR,
+ "out of range [%d] of option [%s]. Allowed range is 0 to 100.",
+ value,
+ RR_LIMITS_MIN_FREE_DISK_OPTION_STRING);
+ return -1;
+ }
+
+ *n = value;
+
+ return 0;
+}
+
+static int
+_rr_options_refresh_interval_validate (const char *value_string, uint32_t *n)
+{
+ uint32_t value = 0;
+
+ if (value_string == NULL)
+ {
+ return -1;
+ }
+
+ if (gf_string2time (value_string, &value) != 0)
+ {
+ gf_log ("rr",
+ GF_LOG_ERROR,
+ "invalid number format [%s] of option [%s]",
+ value_string,
+ RR_REFRESH_INTERVAL_OPTION_STRING);
+ return -1;
+ }
+
+ *n = value;
+
+ return 0;
+}
+
+static int
+_rr_options_read_only_subvolumes_validate (const char *value_string,
+ char ***volume_list,
+ uint64_t *volume_count)
+{
+ char **vlist = NULL;
+ int vcount = 0;
+ int i = 0;
+
+ if (value_string == NULL || volume_list == NULL || volume_count)
+ {
+ return -1;
+ }
+
+ if (gf_strsplit (value_string,
+ ", ",
+ &vlist,
+ &vcount) != 0)
+ {
+ gf_log ("rr",
+ GF_LOG_ERROR,
+ "invalid subvolume list [%s] of option [%s]",
+ value_string,
+ RR_READ_ONLY_SUBVOLUMES_OPTION_STRING);
+ return -1;
+ }
+
+ for (i = 0; i < vcount; i++)
+ {
+ if (gf_volume_name_validate (vlist[i]) != 0)
+ {
+ gf_log ("rr",
+ GF_LOG_ERROR,
+ "invalid subvolume name [%s] in [%s] of option [%s]",
+ vlist[i],
+ value_string,
+ RR_READ_ONLY_SUBVOLUMES_OPTION_STRING);
+ goto free_exit;
+ }
+ }
+
+ *volume_list = vlist;
+ *volume_count = vcount;
+
+ return 0;
+
+ free_exit:
+ for (i = 0; i < vcount; i++)
+ {
+ free (vlist[i]);
+ }
+ free (vlist);
+
+ return -1;
+}
+
+int
+rr_options_validate (dict_t *options, rr_options_t *rr_options)
+{
+ char *value_string = NULL;
+
+ if (options == NULL || rr_options == NULL)
+ {
+ return -1;
+ }
+
+ if (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING))
+ if (data_to_str (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)))
+ value_string = data_to_str (dict_get (options,
+ RR_LIMITS_MIN_FREE_DISK_OPTION_STRING));
+ if (value_string != NULL)
+ {
+ if (_rr_options_min_free_disk_validate (value_string,
+ &rr_options->min_free_disk) != 0)
+ {
+ return -1;
+ }
+
+ gf_log ("rr",
+ GF_LOG_WARNING,
+ "using %s = %d",
+ RR_LIMITS_MIN_FREE_DISK_OPTION_STRING,
+ rr_options->min_free_disk);
+ }
+ else
+ {
+ rr_options->min_free_disk = RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT;
+
+ gf_log ("rr", GF_LOG_DEBUG,
+ "using %s = %d [default]",
+ RR_LIMITS_MIN_FREE_DISK_OPTION_STRING,
+ rr_options->min_free_disk);
+ }
+
+ value_string = NULL;
+ if (dict_get (options, RR_REFRESH_INTERVAL_OPTION_STRING))
+ value_string = data_to_str (dict_get (options,
+ RR_REFRESH_INTERVAL_OPTION_STRING));
+ if (value_string != NULL)
+ {
+ if (_rr_options_refresh_interval_validate (value_string,
+ &rr_options->refresh_interval) != 0)
+ {
+ return -1;
+ }
+
+ gf_log ("rr",
+ GF_LOG_WARNING,
+ "using %s = %d",
+ RR_REFRESH_INTERVAL_OPTION_STRING,
+ rr_options->refresh_interval);
+ }
+ else
+ {
+ rr_options->refresh_interval = RR_REFRESH_INTERVAL_VALUE_DEFAULT;
+
+ gf_log ("rr", GF_LOG_DEBUG,
+ "using %s = %d [default]",
+ RR_REFRESH_INTERVAL_OPTION_STRING,
+ rr_options->refresh_interval);
+ }
+
+ value_string = NULL;
+ if (dict_get (options, RR_READ_ONLY_SUBVOLUMES_OPTION_STRING))
+ value_string = data_to_str (dict_get (options,
+ RR_READ_ONLY_SUBVOLUMES_OPTION_STRING));
+ if (value_string != NULL)
+ {
+ if (_rr_options_read_only_subvolumes_validate (value_string,
+ &rr_options->read_only_subvolume_list,
+ &rr_options->read_only_subvolume_count) != 0)
+ {
+ return -1;
+ }
+
+ gf_log ("rr",
+ GF_LOG_WARNING,
+ "using %s = [%s]",
+ RR_READ_ONLY_SUBVOLUMES_OPTION_STRING,
+ value_string);
+ }
+
+ return 0;
+}
+
+struct volume_options options[] = {
+ { .key = { "scheduler.refresh-interval",
+ "rr.refresh-interval" },
+ .type = GF_OPTION_TYPE_TIME
+ },
+ { .key = { "scheduler.limits.min-free-disk",
+ "rr.limits.min-free-disk" },
+ .type = GF_OPTION_TYPE_PERCENT
+ },
+ { .key = { "scheduler.read-only-subvolumes",
+ "rr.read-only-subvolumes" },
+ .type = GF_OPTION_TYPE_ANY
+ },
+ { .key = {NULL} }
+};
diff --git a/scheduler/rr/src/rr-options.h b/scheduler/rr/src/rr-options.h
new file mode 100644
index 00000000000..4818c7d491c
--- /dev/null
+++ b/scheduler/rr/src/rr-options.h
@@ -0,0 +1,34 @@
+/*
+ Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RR_OPTIONS_H
+#define _RR_OPTIONS_H
+
+struct rr_options
+{
+ uint32_t min_free_disk;
+ uint32_t refresh_interval;
+ char **read_only_subvolume_list;
+ uint64_t read_only_subvolume_count;
+};
+typedef struct rr_options rr_options_t;
+
+int rr_options_validate (dict_t *options, rr_options_t *rr_options);
+
+#endif
diff --git a/scheduler/rr/src/rr.c b/scheduler/rr/src/rr.c
new file mode 100644
index 00000000000..3e54ff5e1b6
--- /dev/null
+++ b/scheduler/rr/src/rr.c
@@ -0,0 +1,565 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+#include <stdlib.h>
+
+#include <stdint.h>
+
+#include "scheduler.h"
+
+#include "rr-options.h"
+#include "rr.h"
+
+#define RR_MIN_FREE_DISK_NOT_REACHED 0
+#define RR_MIN_FREE_DISK_REACHED 1
+
+#define RR_SUBVOLUME_OFFLINE 0
+#define RR_SUBVOLUME_ONLINE 1
+
+#define LOG_ERROR(args...) gf_log ("rr", GF_LOG_ERROR, ##args)
+#define LOG_WARNING(args...) gf_log ("rr", GF_LOG_WARNING, ##args)
+#define LOG_CRITICAL(args...) gf_log ("rr", GF_LOG_CRITICAL, ##args)
+
+#define ROUND_ROBIN(index, count) ((index + 1) % count)
+
+static int
+_cleanup_rr (rr_t *rr)
+{
+ int i;
+
+ if (rr == NULL)
+ {
+ return -1;
+ }
+
+ if (rr->options.read_only_subvolume_list != NULL)
+ {
+ for (i = 0; i < rr->options.read_only_subvolume_count; i++)
+ {
+ free (rr->options.read_only_subvolume_list[i]);
+ }
+ free (rr->options.read_only_subvolume_list);
+ }
+
+ free (rr->subvolume_list);
+
+ free (rr);
+
+ return 0;
+}
+
+int
+rr_init (xlator_t *this_xl)
+{
+ rr_t *rr = NULL;
+ dict_t *options = NULL;
+ xlator_list_t *children = NULL;
+ uint64_t children_count = 0;
+ int i = 0;
+ int j = 0;
+
+ if (this_xl == NULL)
+ {
+ return -1;
+ }
+
+ if ((options = this_xl->options) == NULL)
+ {
+ return -1;
+ }
+
+ if ((children = this_xl->children) == NULL)
+ {
+ return -1;
+ }
+
+ if ((rr = CALLOC (1, sizeof (rr_t))) == NULL)
+ {
+ return -1;
+ }
+
+ if (rr_options_validate (options, &rr->options) != 0)
+ {
+ free (rr);
+ return -1;
+ }
+
+ for (i = 0; i < rr->options.read_only_subvolume_count; i++)
+ {
+ char found = 0;
+
+ for (children = this_xl->children;
+ children != NULL;
+ children = children->next)
+ {
+ if (strcmp (rr->options.read_only_subvolume_list[i],
+ children->xlator->name) == 0)
+ {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ LOG_ERROR ("read-only subvolume [%s] not found in volume list",
+ rr->options.read_only_subvolume_list[i]);
+ _cleanup_rr (rr);
+ return -1;
+ }
+ }
+
+ for (children = this_xl->children;
+ children != NULL;
+ children = children->next)
+ {
+ children_count++;
+ }
+
+ /* bala: excluding read_only_subvolumes */
+ if ((rr->subvolume_count = children_count -
+ rr->options.read_only_subvolume_count) == 0)
+ {
+ LOG_ERROR ("no writable volumes found for scheduling");
+ _cleanup_rr (rr);
+ return -1;
+ }
+
+ if ((rr->subvolume_list = CALLOC (rr->subvolume_count,
+ sizeof (rr_subvolume_t))) == NULL)
+ {
+ _cleanup_rr (rr);
+ return -1;
+ }
+
+ i = 0;
+ j = 0;
+ for (children = this_xl->children;
+ children != NULL;
+ children = children->next)
+ {
+ char found = 0;
+
+ for (j = 0; j < rr->options.read_only_subvolume_count; j++)
+ {
+ if (strcmp (rr->options.read_only_subvolume_list[i],
+ children->xlator->name) == 0)
+ {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ rr_subvolume_t *subvolume = NULL;
+
+ subvolume = &rr->subvolume_list[i];
+
+ subvolume->xl = children->xlator;
+ subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED;
+ subvolume->status = RR_SUBVOLUME_ONLINE;
+
+ i++;
+ }
+ }
+
+ rr->schedule_index = UINT64_MAX;
+ rr->last_stat_fetched_time.tv_sec = 0;
+ rr->last_stat_fetched_time.tv_usec = 0;
+ pthread_mutex_init (&rr->mutex, NULL);
+
+ *((long *)this_xl->private) = (long)rr;
+
+ return 0;
+}
+
+void
+rr_fini (xlator_t *this_xl)
+{
+ rr_t *rr = NULL;
+
+ if (this_xl == NULL)
+ {
+ return;
+ }
+
+ if ((rr = (rr_t *) *((long *)this_xl->private)) != NULL)
+ {
+ pthread_mutex_destroy (&rr->mutex);
+ _cleanup_rr (rr);
+ this_xl->private = NULL;
+ }
+
+ return;
+}
+
+xlator_t *
+rr_schedule (xlator_t *this_xl, const void *path)
+{
+ rr_t *rr = NULL;
+ uint64_t next_schedule_index = 0;
+ int i = 0;
+
+ if (this_xl == NULL || path == NULL)
+ {
+ return NULL;
+ }
+
+ rr = (rr_t *) *((long *)this_xl->private);
+ next_schedule_index = ROUND_ROBIN (rr->schedule_index,
+ rr->subvolume_count);
+
+ rr_update (this_xl);
+
+ for (i = next_schedule_index; i < rr->subvolume_count; i++)
+ {
+ if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE &&
+ rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ rr->schedule_index = i;
+ pthread_mutex_unlock (&rr->mutex);
+ return rr->subvolume_list[i].xl;
+ }
+ }
+
+ for (i = 0; i < next_schedule_index; i++)
+ {
+ if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE &&
+ rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ rr->schedule_index = i;
+ pthread_mutex_unlock (&rr->mutex);
+ return rr->subvolume_list[i].xl;
+ }
+ }
+
+ for (i = next_schedule_index; i < rr->subvolume_count; i++)
+ {
+ if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ rr->schedule_index = i;
+ pthread_mutex_unlock (&rr->mutex);
+ return rr->subvolume_list[i].xl;
+ }
+ }
+
+ for (i = 0; i < next_schedule_index; i++)
+ {
+ if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ rr->schedule_index = i;
+ pthread_mutex_unlock (&rr->mutex);
+ return rr->subvolume_list[i].xl;
+ }
+ }
+
+ return NULL;
+}
+
+void
+rr_update (xlator_t *this_xl)
+{
+ rr_t *rr = NULL;
+ struct timeval ctime = {0, 0};
+ int i = 0;
+
+ if (this_xl == NULL)
+ {
+ return ;
+ }
+
+ if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+ {
+ return ;
+ }
+
+ if (gettimeofday (&ctime, NULL) != 0)
+ {
+ return ;
+ }
+
+ if (ctime.tv_sec > (rr->options.refresh_interval +
+ rr->last_stat_fetched_time.tv_sec))
+ {
+ pthread_mutex_lock (&rr->mutex);
+ rr->last_stat_fetched_time = ctime;
+ pthread_mutex_unlock (&rr->mutex);
+
+ for (i = 0; i < rr->subvolume_count; i++)
+ {
+ xlator_t *subvolume_xl = NULL;
+ call_frame_t *frame = NULL;
+ call_pool_t *pool = NULL;
+
+ subvolume_xl = rr->subvolume_list[i].xl;
+
+ pool = this_xl->ctx->pool;
+
+ frame = create_frame (this_xl, pool);
+
+ STACK_WIND_COOKIE (frame,
+ rr_update_cbk,
+ subvolume_xl->name,
+ subvolume_xl,
+ subvolume_xl->mops->stats,
+ 0);
+ }
+ }
+
+ return ;
+}
+
+int
+rr_update_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this_xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *stats)
+{
+ rr_t *rr = NULL;
+ rr_subvolume_t *subvolume = NULL;
+ uint8_t free_disk_percent = 0;
+ int i = 0;
+
+ if (frame == NULL)
+ {
+ return -1;
+ }
+
+ if (cookie == NULL || this_xl == NULL)
+ {
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ if (op_ret == 0 && stats == NULL)
+ {
+ LOG_CRITICAL ("fatal! op_ret is 0 and stats is NULL. "
+ "Please report this to <gluster-devel@nongnu.org>");
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+ {
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ for (i = 0; i < rr->subvolume_count; i++)
+ {
+ if (rr->subvolume_list[i].xl->name == (char *) cookie)
+ {
+ subvolume = &rr->subvolume_list[i];
+ break;
+ }
+ }
+
+ if (subvolume == NULL)
+ {
+ LOG_ERROR ("unknown cookie [%s]", (char *) cookie);
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ if (op_ret == 0)
+ {
+ free_disk_percent = (stats->free_disk * 100) / stats->total_disk_size;
+ if (free_disk_percent > rr->options.min_free_disk)
+ {
+ if (subvolume->free_disk_status != RR_MIN_FREE_DISK_NOT_REACHED)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED;
+ pthread_mutex_unlock (&rr->mutex);
+ LOG_WARNING ("subvolume [%s] is available with free space for scheduling",
+ subvolume->xl->name);
+ }
+ }
+ else
+ {
+ if (subvolume->free_disk_status != RR_MIN_FREE_DISK_REACHED)
+ {
+ pthread_mutex_lock (&rr->mutex);
+ subvolume->free_disk_status = RR_MIN_FREE_DISK_REACHED;
+ pthread_mutex_unlock (&rr->mutex);
+ LOG_WARNING ("subvolume [%s] reached minimum disk space requirement",
+ subvolume->xl->name);
+ }
+ }
+ }
+ else
+ {
+ pthread_mutex_lock (&rr->mutex);
+ subvolume->status = RR_SUBVOLUME_OFFLINE;
+ pthread_mutex_unlock (&rr->mutex);
+ LOG_ERROR ("unable to get subvolume [%s] status information and "
+ "scheduling is disabled",
+ subvolume->xl->name);
+ }
+
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+void
+rr_notify (xlator_t *this_xl, int32_t event, void *data)
+{
+ rr_t *rr = NULL;
+ rr_subvolume_t *subvolume = NULL;
+ xlator_t *subvolume_xl = NULL;
+ int i = 0, ret = 0;
+ call_frame_t *frame = NULL;
+ call_pool_t *pool = NULL;
+ dict_t *xattr = get_new_dict ();
+ int32_t version[1] = {1};
+
+ if (this_xl == NULL || data == NULL) {
+ return ;
+ }
+
+ if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) {
+ return ;
+ }
+
+ subvolume_xl = (xlator_t *) data;
+
+ for (i = 0; i < rr->subvolume_count; i++) {
+ if (rr->subvolume_list[i].xl == subvolume_xl) {
+ subvolume = &rr->subvolume_list[i];
+ break;
+ }
+ }
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ /* Seeding, to be done only once */
+ if (rr->first_time && (i == rr->subvolume_count)) {
+ loc_t loc = {0,};
+ xlator_t *trav = NULL;
+
+ pool = this_xl->ctx->pool;
+ frame = create_frame (this_xl, pool);
+ ret = dict_set_bin (xattr, "trusted.glusterfs.scheduler.rr",
+ version, sizeof (int32_t));
+ if (-1 == ret) {
+ gf_log (this_xl->name, GF_LOG_ERROR, "rr seed setting failed");
+ }
+ if (xattr)
+ dict_ref (xattr);
+
+ loc.path = strdup ("/");
+ for (trav = this_xl->parents->xlator; trav; trav = trav->parents->xlator) {
+ if (trav->itable) {
+ loc.inode = trav->itable->root;
+ break;
+ }
+ }
+ STACK_WIND (frame,
+ rr_notify_cbk,
+ (xlator_t *)data,
+ ((xlator_t *)data)->fops->xattrop,
+ &loc,
+ GF_XATTROP_ADD_ARRAY,
+ xattr);
+
+ if (xattr)
+ dict_unref (xattr);
+
+ rr->first_time = 0;
+ }
+ if (subvolume) {
+ pthread_mutex_lock (&rr->mutex);
+ subvolume->status = RR_SUBVOLUME_ONLINE;
+ pthread_mutex_unlock (&rr->mutex);
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ if (subvolume) {
+ pthread_mutex_lock (&rr->mutex);
+ subvolume->status = RR_SUBVOLUME_OFFLINE;
+ pthread_mutex_unlock (&rr->mutex);
+ }
+ break;
+ }
+
+ return ;
+}
+
+int
+rr_notify_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this_xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xattr)
+{
+ rr_t *rr = NULL;
+ int32_t *index = NULL;
+ int32_t ret = -1;
+ void *tmp_index_ptr = NULL;
+
+ if (frame == NULL)
+ {
+ return -1;
+ }
+
+ if ((this_xl == NULL) || (op_ret == -1))
+ {
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+ {
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ ret = dict_get_bin (xattr, "trusted.glusterfs.scheduler.rr", &tmp_index_ptr);
+ index = tmp_index_ptr;
+ if (ret == 0)
+ rr->schedule_index = (index[0] % rr->subvolume_count);
+ else
+ rr->schedule_index = 0;
+
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+struct sched_ops sched = {
+ .init = rr_init,
+ .fini = rr_fini,
+ .update = rr_update,
+ .schedule = rr_schedule,
+ .notify = rr_notify
+};
+
diff --git a/scheduler/rr/src/rr.h b/scheduler/rr/src/rr.h
new file mode 100644
index 00000000000..baa471209e8
--- /dev/null
+++ b/scheduler/rr/src/rr.h
@@ -0,0 +1,70 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RR_H
+#define _RR_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+#include <stdint.h>
+#include <sys/time.h>
+
+struct rr_subvolume
+{
+ xlator_t *xl;
+ uint8_t free_disk_status;
+ uint8_t status;
+};
+typedef struct rr_subvolume rr_subvolume_t;
+
+struct rr
+{
+ rr_options_t options;
+ rr_subvolume_t *subvolume_list;
+ uint64_t subvolume_count;
+ uint64_t schedule_index;
+ struct timeval last_stat_fetched_time;
+ pthread_mutex_t mutex;
+ char first_time;
+};
+typedef struct rr rr_t;
+
+int rr_init (xlator_t *this_xl);
+void rr_fini (xlator_t *this_xl);
+xlator_t *rr_schedule (xlator_t *this_xl, const void *path);
+void rr_update (xlator_t *this_xl);
+int rr_update_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this_xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct xlator_stats *stats);
+void rr_notify (xlator_t *this_xl, int32_t event, void *data);
+int rr_notify_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this_xl,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xattr);
+
+#endif /* _RR_H */
diff --git a/scheduler/switch/Makefile.am b/scheduler/switch/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/scheduler/switch/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/scheduler/switch/src/Makefile.am b/scheduler/switch/src/Makefile.am
new file mode 100644
index 00000000000..dc7d16d40d2
--- /dev/null
+++ b/scheduler/switch/src/Makefile.am
@@ -0,0 +1,12 @@
+sched_LTLIBRARIES = switch.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+switch_la_LDFLAGS = -module -avoidversion
+
+switch_la_SOURCES = switch.c
+switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/scheduler/switch/src/switch.c b/scheduler/switch/src/switch.c
new file mode 100644
index 00000000000..70b3071871d
--- /dev/null
+++ b/scheduler/switch/src/switch.c
@@ -0,0 +1,398 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <fnmatch.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "scheduler.h"
+
+struct switch_sched_array {
+ xlator_t *xl;
+ int32_t eligible;
+ int32_t considered;
+};
+
+/* Select one of this struct based on the path's pattern match */
+struct switch_sched_struct {
+ struct switch_sched_struct *next;
+ struct switch_sched_array *array;
+ char path_pattern[256];
+ int32_t node_index; /* Index of the node in
+ this pattern. */
+ int32_t num_child; /* Total num of child nodes
+ with this pattern. */
+};
+
+struct switch_struct {
+ struct switch_sched_struct *cond;
+ struct switch_sched_array *array;
+ pthread_mutex_t switch_mutex;
+ int32_t child_count;
+};
+
+/* This function should return child node as '*:subvolumes' is inserterd */
+static xlator_t *
+switch_get_matching_xl (const char *path, struct switch_sched_struct *cond)
+{
+ struct switch_sched_struct *trav = cond;
+ char *pathname = strdup (path);
+ int index = 0;
+
+ while (trav) {
+ if (fnmatch (trav->path_pattern,
+ pathname, FNM_NOESCAPE) == 0) {
+ free (pathname);
+ trav->node_index %= trav->num_child;
+ index = (trav->node_index++) % trav->num_child;
+ return trav->array[index].xl;
+ }
+ trav = trav->next;
+ }
+ free (pathname);
+ return NULL;
+}
+
+
+static int32_t
+switch_init (xlator_t *xl)
+{
+ int32_t index = 0;
+ data_t *data = NULL;
+ char *child = NULL;
+ char *tmp = NULL;
+ char *childs_data = NULL;
+ xlator_list_t *trav_xl = xl->children;
+ struct switch_struct *switch_buf = NULL;
+
+ switch_buf = CALLOC (1, sizeof (struct switch_struct));
+ ERR_ABORT (switch_buf);
+
+ while (trav_xl) {
+ index++;
+ trav_xl = trav_xl->next;
+ }
+ switch_buf->child_count = index;
+ switch_buf->array = CALLOC (index + 1,
+ sizeof (struct switch_sched_struct));
+ ERR_ABORT (switch_buf->array);
+ trav_xl = xl->children;
+ index = 0;
+
+ while (trav_xl) {
+ switch_buf->array[index].xl = trav_xl->xlator;
+ switch_buf->array[index].eligible = 1;
+ trav_xl = trav_xl->next;
+ index++;
+ }
+
+ data = dict_get (xl->options, "scheduler.read-only-subvolumes");
+ if (data) {
+ childs_data = strdup (data->data);
+ child = strtok_r (childs_data, ",", &tmp);
+ while (child) {
+ for (index = 1;
+ index < switch_buf->child_count; index++) {
+ if (strcmp (switch_buf->array[index - 1].xl->name, child) == 0) {
+ gf_log ("switch", GF_LOG_DEBUG,
+ "Child '%s' is read-only",
+ child);
+ memcpy (&(switch_buf->array[index-1]),
+ &(switch_buf->array[switch_buf->child_count - 1]),
+ sizeof (struct switch_sched_array));
+ switch_buf->child_count--;
+ break;
+ }
+ }
+ child = strtok_r (NULL, ",", &tmp);
+ }
+ free (childs_data);
+ }
+
+ data = dict_get (xl->options, "scheduler.local-volume-name");
+ if (data) {
+ /* Means, give preference to that node first */
+ gf_log ("switch", GF_LOG_DEBUG,
+ "local volume defined as %s", data->data);
+
+ /* TODO: parse it properly, have an extra index to
+ specify that first */
+ }
+
+ /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */
+ data = dict_get (xl->options, "scheduler.switch.case");
+ if (data) {
+ char *tmp_str = NULL;
+ char *tmp_str1 = NULL;
+ char *dup_str = NULL;
+ char *switch_str = NULL;
+ char *pattern = NULL;
+ char *childs = NULL;
+ struct switch_sched_struct *switch_opt = NULL;
+ struct switch_sched_struct *trav = NULL;
+ /* Get the pattern for considering switch case.
+ "option block-size *avi:10MB" etc */
+ switch_str = strtok_r (data->data, ";", &tmp_str);
+ while (switch_str) {
+ dup_str = strdup (switch_str);
+ switch_opt =
+ CALLOC (1,
+ sizeof (struct switch_sched_struct));
+ ERR_ABORT (switch_opt);
+
+ /* Link it to the main structure */
+ if (switch_buf->cond) {
+ /* there are already few entries */
+ trav = switch_buf->cond;
+ while (trav->next)
+ trav = trav->next;
+ trav->next = switch_opt;
+ } else {
+ /* First entry */
+ switch_buf->cond = switch_opt;
+ }
+ pattern = strtok_r (dup_str, ":", &tmp_str1);
+ childs = strtok_r (NULL, ":", &tmp_str1);
+ if (strncmp (pattern, "*", 2) == 0) {
+ gf_log ("switch", GF_LOG_WARNING,
+ "'*' pattern will be taken by default "
+ "for all the unconfigured child nodes,"
+ " hence neglecting current option");
+ switch_str = strtok_r (NULL, ";", &tmp_str);
+ free (dup_str);
+ continue;
+ }
+ memcpy (switch_opt->path_pattern,
+ pattern, strlen (pattern));
+ if (childs) {
+ int32_t idx = 0;
+ char *tmp1 = NULL;
+ char *dup_childs = NULL;
+ /* TODO: get the list of child nodes for
+ the given pattern */
+ dup_childs = strdup (childs);
+ child = strtok_r (dup_childs, ",", &tmp);
+ while (child) {
+ idx++;
+ child = strtok_r (NULL, ",", &tmp);
+ }
+ free (dup_childs);
+ child = strtok_r (childs, ",", &tmp1);
+ switch_opt->num_child = idx;
+ switch_opt->array =
+ CALLOC (1, idx * sizeof (struct switch_sched_array));
+ ERR_ABORT (switch_opt->array);
+ idx = 0;
+ child = strtok_r (childs, ",", &tmp);
+ while (child) {
+ for (index = 1;
+ index < switch_buf->child_count;
+ index++) {
+ if (strcmp (switch_buf->array[index - 1].xl->name,
+ child) == 0) {
+ gf_log ("switch",
+ GF_LOG_DEBUG,
+ "'%s' pattern will be scheduled to \"%s\"",
+ switch_opt->path_pattern, child);
+ /*
+ if (switch_buf->array[index-1].considered) {
+ gf_log ("switch", GF_LOG_DEBUG,
+ "ambiguity found, exiting");
+ return -1;
+ }
+ */
+ switch_opt->array[idx].xl = switch_buf->array[index-1].xl;
+ switch_buf->array[index-1].considered = 1;
+ idx++;
+ break;
+ }
+ }
+ child = strtok_r (NULL, ",", &tmp1);
+ }
+ } else {
+ /* error */
+ gf_log ("switch", GF_LOG_ERROR,
+ "Check \"scheduler.switch.case\" "
+ "option in unify volume. Exiting");
+ free (switch_buf->array);
+ free (switch_buf);
+ return -1;
+ }
+ free (dup_str);
+ switch_str = strtok_r (NULL, ";", &tmp_str);
+ }
+ }
+ /* Now, all the pattern based considerations done, so for all the
+ * remaining pattern, '*' to all the remaining child nodes
+ */
+ {
+ struct switch_sched_struct *switch_opt = NULL;
+ int32_t flag = 0;
+ int32_t index = 0;
+ for (index=0; index < switch_buf->child_count; index++) {
+ /* check for considered flag */
+ if (switch_buf->array[index].considered)
+ continue;
+ flag++;
+ }
+ if (!flag) {
+ gf_log ("switch", GF_LOG_ERROR,
+ "No nodes left for pattern '*'. Exiting.");
+ return -1;
+ }
+ switch_opt = CALLOC (1, sizeof (struct switch_sched_struct));
+ ERR_ABORT (switch_opt);
+ if (switch_buf->cond) {
+ /* there are already few entries */
+ struct switch_sched_struct *trav = switch_buf->cond;
+ while (trav->next)
+ trav = trav->next;
+ trav->next = switch_opt;
+ } else {
+ /* First entry */
+ switch_buf->cond = switch_opt;
+ }
+ /* Add the '*' pattern to the array */
+ memcpy (switch_opt->path_pattern, "*", 2);
+ switch_opt->num_child = flag;
+ switch_opt->array =
+ CALLOC (1, flag * sizeof (struct switch_sched_array));
+ ERR_ABORT (switch_opt->array);
+ flag = 0;
+ for (index=0; index < switch_buf->child_count; index++) {
+ /* check for considered flag */
+ if (switch_buf->array[index].considered)
+ continue;
+ gf_log ("switch", GF_LOG_DEBUG,
+ "'%s' pattern will be scheduled to \"%s\"",
+ switch_opt->path_pattern,
+ switch_buf->array[index].xl->name);
+ switch_opt->array[flag].xl =
+ switch_buf->array[index].xl;
+ switch_buf->array[index].considered = 1;
+ flag++;
+ }
+ }
+
+ pthread_mutex_init (&switch_buf->switch_mutex, NULL);
+
+ // put it at the proper place
+ *((long *)xl->private) = (long)switch_buf;
+
+ return 0;
+}
+
+static void
+switch_fini (xlator_t *xl)
+{
+ /* TODO: free all the allocated entries */
+ struct switch_struct *switch_buf = NULL;
+ switch_buf = (struct switch_struct *)*((long *)xl->private);
+
+ pthread_mutex_destroy (&switch_buf->switch_mutex);
+ free (switch_buf->array);
+ free (switch_buf);
+}
+
+static xlator_t *
+switch_schedule (xlator_t *xl, const void *path)
+{
+ struct switch_struct *switch_buf = NULL;
+ switch_buf = (struct switch_struct *)*((long *)xl->private);
+
+ return switch_get_matching_xl (path, switch_buf->cond);
+}
+
+
+/**
+ * notify
+ */
+void
+switch_notify (xlator_t *xl, int32_t event, void *data)
+{
+ /* TODO: This should be checking in switch_sched_struct */
+#if 0
+ struct switch_struct *switch_buf = NULL;
+ int32_t idx = 0;
+
+ switch_buf = (struct switch_struct *)*((long *)xl->private);
+ if (!switch_buf)
+ return;
+
+ for (idx = 0; idx < switch_buf->child_count; idx++) {
+ if (switch_buf->array[idx].xl == (xlator_t *)data)
+ break;
+ }
+
+ switch (event)
+ {
+ case GF_EVENT_CHILD_UP:
+ {
+ switch_buf->array[idx].eligible = 1;
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ {
+ switch_buf->array[idx].eligible = 0;
+ }
+ break;
+ default:
+ {
+ ;
+ }
+ break;
+ }
+#endif
+}
+
+static void
+switch_update (xlator_t *xl)
+{
+ return;
+}
+
+struct sched_ops sched = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .update = switch_update,
+ .schedule = switch_schedule,
+ .notify = switch_notify
+};
+
+struct volume_options options[] = {
+ { .key = { "scheduler.read-only-subvolumes" ,
+ "switch.read-only-subvolumes"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+ { .key = { "scheduler.local-volume-name",
+ "switch.nufa.local-volume-name" },
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+ { .key = { "scheduler.switch.case",
+ "switch.case" },
+ .type = GF_OPTION_TYPE_ANY
+ },
+ { .key = {NULL} }
+};