Added all files

author: Vikas Gorur <vikas@zresearch.com> 2009-02-18 17:36:07 +0530
committer: Vikas Gorur <vikas@zresearch.com> 2009-02-18 17:36:07 +0530
commit: 77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree: 02e155a5753b398ee572b45793f889b538efab6b /scheduler
parent: f3b2e6580e5663292ee113c741343c8a43ee133f (diff)
21 files changed, 3220 insertions, 0 deletions
diff --git a/scheduler/Makefile.am b/scheduler/Makefile.am
new file mode 100644
index 000000000..618fa7dd8
--- /dev/null
+++ b/scheduler/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = alu random nufa rr switch
+
+CLEANFILES = 
diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/scheduler/alu/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES = 
diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am
new file mode 100644
index 000000000..eb7d0db07
--- /dev/null
+++ b/scheduler/alu/src/Makefile.am
@@ -0,0 +1,14 @@
+sched_LTLIBRARIES = alu.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+alu_la_LDFLAGS = -module -avoidversion
+
+alu_la_SOURCES = alu.c
+alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = alu.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c
new file mode 100644
index 000000000..754c5e353
--- /dev/null
+++ b/scheduler/alu/src/alu.c
@@ -0,0 +1,993 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/* ALU code needs a complete re-write. This is one of the most important 
+ * part of GlusterFS and so needs more and more reviews and testing 
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+#include <stdint.h>
+#include "stack.h"
+#include "alu.h"
+
+#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT         (1 * GF_UNIT_GB)
+#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT          (512 * GF_UNIT_MB)
+
+#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT        25
+#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT         5
+
+#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT         25
+#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT          5
+
+#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT   1000
+#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT    100
+
+#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT             100
+
+#define ALU_REFRESH_INTERVAL_DEFAULT                   5
+#define ALU_REFRESH_CREATE_COUNT_DEFAULT               5
+
+
+static int64_t 
+get_stats_disk_usage (struct xlator_stats *this)
+{
+	return this->disk_usage;
+}
+
+static int64_t 
+get_stats_write_usage (struct xlator_stats *this)
+{
+	return this->write_usage;
+}
+
+static int64_t 
+get_stats_read_usage (struct xlator_stats *this)
+{
+	return this->read_usage;
+}
+
+static int64_t 
+get_stats_disk_speed (struct xlator_stats *this)
+{
+	return this->disk_speed;
+}
+
+static int64_t 
+get_stats_file_usage (struct xlator_stats *this)
+{
+	/* Avoid warning "defined but not used" */
+	(void) &get_stats_file_usage;    
+
+	return this->nr_files;
+}
+
+static int64_t 
+get_stats_free_disk (struct xlator_stats *this)
+{
+	if (this->total_disk_size > 0)
+		return (this->free_disk * 100) / this->total_disk_size;
+	return 0;
+}
+
+static int64_t 
+get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+	return (max->write_usage - min->write_usage);
+}
+
+static int64_t 
+get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+	return (max->read_usage - min->read_usage);
+}
+
+static int64_t 
+get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+	return (max->disk_usage - min->disk_usage);
+}
+
+static int64_t 
+get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min)
+{
+	return (max->disk_speed - min->disk_speed);
+}
+
+static int64_t 
+get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min)
+{
+	return (max->nr_files - min->nr_files);
+}
+
+
+int 
+alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched)
+{
+	data_t *order = dict_get (xl->options, "scheduler.alu.order");
+	if (!order) {
+		gf_log (xl->name, GF_LOG_ERROR,
+			"option 'scheduler.alu.order' not specified");
+		return -1;
+	}
+	struct alu_threshold *_threshold_fn;
+	struct alu_threshold *tmp_threshold;
+	data_t *entry_fn = NULL;
+	data_t *exit_fn = NULL;
+	char *tmp_str = NULL;
+	char *order_str = strtok_r (order->data, ":", &tmp_str);
+	/* Get the scheduling priority order, specified by the user. */
+	while (order_str) {
+		gf_log ("alu", GF_LOG_DEBUG,
+			"alu_init: order string: %s",
+			order_str);
+		if (strcmp (order_str, "disk-usage") == 0) {
+			/* Disk usage */
+			_threshold_fn = 
+				CALLOC (1, sizeof (struct alu_threshold));
+			ERR_ABORT (_threshold_fn);
+			_threshold_fn->diff_value = get_max_diff_disk_usage;
+			_threshold_fn->sched_value = get_stats_disk_usage;
+			entry_fn = 
+				dict_get (xl->options, 
+					  "scheduler.alu.disk-usage.entry-threshold");
+			if (entry_fn) {
+				if (gf_string2bytesize (entry_fn->data, 
+							&alu_sched->entry_limit.disk_usage) != 0) {
+					gf_log (xl->name, GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"disk-usage.entry-threshold\"",
+						entry_fn->data);
+					return -1;
+				}
+			} else {
+				alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->entry_value = get_stats_disk_usage;
+			exit_fn = dict_get (xl->options, 
+					    "scheduler.alu.disk-usage.exit-threshold");
+			if (exit_fn) {
+				if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0)	{
+					gf_log (xl->name, GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"disk-usage.exit-threshold\"", 
+						exit_fn->data);
+					return -1;
+				}
+			} else {
+				alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->exit_value = get_stats_disk_usage;
+			tmp_threshold = alu_sched->threshold_fn;
+			if (!tmp_threshold) {
+				alu_sched->threshold_fn = _threshold_fn;
+			} else {
+				while (tmp_threshold->next) {
+					tmp_threshold = tmp_threshold->next;
+				}
+				tmp_threshold->next = _threshold_fn;
+			}
+			gf_log ("alu",
+				GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"", 
+				alu_sched->entry_limit.disk_usage, 
+				alu_sched->exit_limit.disk_usage);
+			
+		} else if (strcmp (order_str, "write-usage") == 0) {
+			/* Handle "write-usage" */
+			
+			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+			ERR_ABORT (_threshold_fn);
+			_threshold_fn->diff_value = get_max_diff_write_usage;
+			_threshold_fn->sched_value = get_stats_write_usage;
+			entry_fn = dict_get (xl->options, 
+					     "scheduler.alu.write-usage.entry-threshold");
+			if (entry_fn) {
+				if (gf_string2bytesize (entry_fn->data, 
+							&alu_sched->entry_limit.write_usage) != 0) {
+					gf_log (xl->name, GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of option scheduler.alu."
+						"write-usage.entry-threshold", 
+						entry_fn->data);
+					return -1;
+				}
+			} else {
+				alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->entry_value = get_stats_write_usage;
+			exit_fn = dict_get (xl->options, 
+					    "scheduler.alu.write-usage.exit-threshold");
+			if (exit_fn) {
+				if (gf_string2bytesize (exit_fn->data, 
+							&alu_sched->exit_limit.write_usage) != 0) {
+					gf_log (xl->name, GF_LOG_ERROR, 
+						"invalid number format \"%s\""
+						" of \"option scheduler.alu."
+						"write-usage.exit-threshold\"",
+						exit_fn->data);
+					return -1;
+				}
+			} else {
+				alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->exit_value = get_stats_write_usage;
+			tmp_threshold = alu_sched->threshold_fn;
+			if (!tmp_threshold) {
+				alu_sched->threshold_fn = _threshold_fn;
+			} else {
+				while (tmp_threshold->next) {
+					tmp_threshold = tmp_threshold->next;
+				}
+				tmp_threshold->next = _threshold_fn;
+			}
+			gf_log (xl->name, GF_LOG_DEBUG, 
+				"alu_init: = %"PRId64",%"PRId64"", 
+				alu_sched->entry_limit.write_usage, 
+				alu_sched->exit_limit.write_usage);
+			
+		} else if (strcmp (order_str, "read-usage") == 0) {
+			/* Read usage */
+			
+			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+			ERR_ABORT (_threshold_fn);
+			_threshold_fn->diff_value = get_max_diff_read_usage;
+			_threshold_fn->sched_value = get_stats_read_usage;
+			entry_fn = dict_get (xl->options, 
+					     "scheduler.alu.read-usage.entry-threshold");
+			if (entry_fn) {
+				if (gf_string2bytesize (entry_fn->data, 
+							&alu_sched->entry_limit.read_usage) != 0) {
+					gf_log (xl->name, 
+						GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"read-usage.entry-threshold\"",
+						entry_fn->data);
+					return -1;
+				}
+			} else {
+				alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->entry_value = get_stats_read_usage;
+			exit_fn = dict_get (xl->options, 
+					    "scheduler.alu.read-usage.exit-threshold");
+			if (exit_fn)
+			{
+				if (gf_string2bytesize (exit_fn->data, 
+							&alu_sched->exit_limit.read_usage) != 0)
+				{
+					gf_log ("alu", GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"read-usage.exit-threshold\"", 
+						exit_fn->data);
+					return -1;
+				}
+			}
+			else
+			{
+				alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->exit_value = get_stats_read_usage;
+			tmp_threshold = alu_sched->threshold_fn;
+			if (!tmp_threshold) {
+				alu_sched->threshold_fn = _threshold_fn;
+			}
+			else {
+				while (tmp_threshold->next) {
+					tmp_threshold = tmp_threshold->next;
+				}
+				tmp_threshold->next = _threshold_fn;
+			}
+			gf_log ("alu", GF_LOG_DEBUG, 
+				"alu_init: = %"PRId64",%"PRId64"", 
+				alu_sched->entry_limit.read_usage, 
+				alu_sched->exit_limit.read_usage);
+			
+		} else if (strcmp (order_str, "open-files-usage") == 0) {
+			/* Open files counter */
+			
+			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+			ERR_ABORT (_threshold_fn);
+			_threshold_fn->diff_value = get_max_diff_file_usage;
+			_threshold_fn->sched_value = get_stats_file_usage;
+			entry_fn = dict_get (xl->options, 
+					     "scheduler.alu.open-files-usage.entry-threshold");
+			if (entry_fn) {
+				if (gf_string2uint64 (entry_fn->data, 
+						      &alu_sched->entry_limit.nr_files) != 0)
+				{
+					gf_log ("alu", GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"open-files-usage.entry-"
+						"threshold\"", entry_fn->data);
+					return -1;
+				}
+			}
+			else
+			{
+				alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->entry_value = get_stats_file_usage;
+			exit_fn = dict_get (xl->options, 
+					    "scheduler.alu.open-files-usage.exit-threshold");
+			if (exit_fn)
+			{
+				if (gf_string2uint64 (exit_fn->data, 
+						      &alu_sched->exit_limit.nr_files) != 0)
+				{
+					gf_log ("alu", GF_LOG_ERROR, 
+						"invalid number format \"%s\" "
+						"of \"option scheduler.alu."
+						"open-files-usage.exit-"
+						"threshold\"", exit_fn->data);
+					return -1;
+				}
+			}
+			else
+			{
+				alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT;
+			}
+			_threshold_fn->exit_value = get_stats_file_usage;
+			tmp_threshold = alu_sched->threshold_fn;
+			if (!tmp_threshold) {
+				alu_sched->threshold_fn = _threshold_fn;
+			}
+			else {
+				while (tmp_threshold->next) {
+					tmp_threshold = tmp_threshold->next;
+				}
+				tmp_threshold->next = _threshold_fn;
+			}
+			gf_log ("alu", GF_LOG_DEBUG, 
+				"alu.c->alu_init: = %"PRIu64",%"PRIu64"", 
+				alu_sched->entry_limit.nr_files, 
+				alu_sched->exit_limit.nr_files);
+			
+		} else if (strcmp (order_str, "disk-speed-usage") == 0) {
+			/* Disk speed */
+			
+			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold));
+			ERR_ABORT (_threshold_fn);
+			_threshold_fn->diff_value = get_max_diff_disk_speed;
+			_threshold_fn->sched_value = get_stats_disk_speed;
+			entry_fn = dict_get (xl->options, 
+					     "scheduler.alu.disk-speed-usage.entry-threshold");
+			if (entry_fn) {
+				gf_log ("alu", GF_LOG_DEBUG,
+					"entry-threshold is given, "
+					"value is constant");
+			}
+			_threshold_fn->entry_value = NULL;
+			exit_fn = dict_get (xl->options, 
+					    "scheduler.alu.disk-speed-usage.exit-threshold");
+			if (exit_fn) {
+				gf_log ("alu", GF_LOG_DEBUG,
+					"exit-threshold is given, "
+					"value is constant");
+			}
+			_threshold_fn->exit_value = NULL;
+			tmp_threshold = alu_sched->threshold_fn;
+			if (!tmp_threshold) {
+				alu_sched->threshold_fn = _threshold_fn;
+			}
+			else {
+				while (tmp_threshold->next) {
+					tmp_threshold = tmp_threshold->next;
+				}
+				tmp_threshold->next = _threshold_fn;
+			}
+			
+		} else {
+			gf_log ("alu", GF_LOG_DEBUG,
+				"%s, unknown option provided to scheduler",
+				order_str);
+		}
+		order_str = strtok_r (NULL, ":", &tmp_str);
+	}
+	
+	return 0;
+}
+
+static int32_t
+alu_init (xlator_t *xl)
+{
+	struct alu_sched *alu_sched = NULL;
+	struct alu_limits *_limit_fn = NULL;
+	struct alu_limits *tmp_limits = NULL;
+	uint32_t min_free_disk = 0;
+	data_t *limits = NULL;
+  
+	alu_sched = CALLOC (1, sizeof (struct alu_sched));
+	ERR_ABORT (alu_sched);
+
+	{
+		alu_parse_options (xl, alu_sched);
+	}
+
+	/* Get the limits */
+	
+	limits = dict_get (xl->options, 
+			   "scheduler.limits.min-free-disk");
+	if (limits) {
+		_limit_fn = CALLOC (1, sizeof (struct alu_limits));
+		ERR_ABORT (_limit_fn);
+		_limit_fn->min_value = get_stats_free_disk;
+		_limit_fn->cur_value = get_stats_free_disk;
+		tmp_limits = alu_sched->limits_fn ;
+		_limit_fn->next = tmp_limits;
+		alu_sched->limits_fn = _limit_fn;
+		
+		if (gf_string2percent (limits->data, 
+				       &min_free_disk) != 0) {
+			gf_log ("alu", GF_LOG_ERROR, 
+				"invalid number format \"%s\" "
+				"of \"option scheduler.limits."
+				"min-free-disk\"", limits->data);
+			return -1;
+		}
+		alu_sched->spec_limit.free_disk = min_free_disk;
+		
+		if (alu_sched->spec_limit.free_disk >= 100) {
+			gf_log ("alu", GF_LOG_ERROR,
+				"check the \"option scheduler."
+				"limits.min-free-disk\", it should "
+				"be percentage value");
+			return -1;
+		}
+		alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */
+		gf_log ("alu", GF_LOG_DEBUG,
+			"alu.limit.min-disk-free = %"PRId64"", 
+			_limit_fn->cur_value (&(alu_sched->spec_limit)));
+	}
+	
+	limits = dict_get (xl->options, 
+			   "scheduler.limits.max-open-files");
+	if (limits) {
+		// Update alu_sched->priority properly
+		_limit_fn = CALLOC (1, sizeof (struct alu_limits));
+		ERR_ABORT (_limit_fn);
+		_limit_fn->max_value = get_stats_file_usage;
+		_limit_fn->cur_value = get_stats_file_usage;
+		tmp_limits = alu_sched->limits_fn ;
+		_limit_fn->next = tmp_limits;
+		alu_sched->limits_fn = _limit_fn;
+		if (gf_string2uint64_base10 (limits->data, 
+					     &alu_sched->spec_limit.nr_files) != 0)
+		{
+			gf_log ("alu", GF_LOG_ERROR, 
+				"invalid number format '%s' of option "
+				"scheduler.limits.max-open-files", 
+				limits->data);
+			return -1;
+		}
+		
+		gf_log ("alu", GF_LOG_DEBUG,
+			"alu_init: limit.max-open-files = %"PRId64"",
+			_limit_fn->cur_value (&(alu_sched->spec_limit)));
+	}
+
+
+	/* Stats refresh options */
+	limits = dict_get (xl->options, 
+			   "scheduler.refresh-interval");
+	if (limits) {
+		if (gf_string2time (limits->data, 
+				    &alu_sched->refresh_interval) != 0)	{
+			gf_log ("alu", GF_LOG_ERROR, 
+				"invalid number format \"%s\" of "
+				"option scheduler.refresh-interval", 
+				limits->data);
+			return -1;
+		}
+	} else {
+		alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT;
+	}
+	gettimeofday (&(alu_sched->last_stat_fetch), NULL);
+	
+	
+	limits = dict_get (xl->options, 
+			   "scheduler.alu.stat-refresh.num-file-create");
+	if (limits) {
+		if (gf_string2uint32 (limits->data, 
+				      &alu_sched->refresh_create_count) != 0)
+		{
+			gf_log ("alu", GF_LOG_ERROR, 
+				"invalid number format \"%s\" of \"option "
+				"alu.stat-refresh.num-file-create\"", 
+				limits->data);
+			return -1;
+		}
+	} else {
+		alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT;
+	}
+
+	{
+		/* Build an array of child_nodes */
+		struct alu_sched_struct *sched_array = NULL;
+		xlator_list_t *trav_xl = xl->children;
+		data_t *data = NULL;
+		int32_t index = 0;
+		
+		while (trav_xl) {
+			index++;
+			trav_xl = trav_xl->next;
+		}
+		alu_sched->child_count = index;
+		sched_array = CALLOC (index, sizeof (struct alu_sched_struct));
+		ERR_ABORT (sched_array);
+		trav_xl = xl->children;
+		index = 0;
+		while (trav_xl) {
+			sched_array[index].xl = trav_xl->xlator;
+			sched_array[index].eligible = 1;
+			index++;
+			trav_xl = trav_xl->next;
+		}
+		alu_sched->array = sched_array;
+
+		data = dict_get (xl->options, 
+				 "scheduler.read-only-subvolumes");
+		if (data) {
+			char *child = NULL;
+			char *tmp = NULL;
+			char *childs_data = strdup (data->data);
+      
+			child = strtok_r (childs_data, ",", &tmp);
+			while (child) {
+				for (index = 1; index < alu_sched->child_count; index++) {
+					if (strcmp (alu_sched->array[index -1].xl->name, child) == 0) {
+						memcpy (&(alu_sched->array[index -1]), 
+							&(alu_sched->array[alu_sched->child_count -1]), 
+							sizeof (struct alu_sched_struct));
+						alu_sched->child_count--;
+						break;
+					}
+				}
+				child = strtok_r (NULL, ",", &tmp);
+			}
+		}
+	}
+
+	*((long *)xl->private) = (long)alu_sched;
+
+	/* Initialize all the alu_sched structure's elements */
+	{
+		alu_sched->sched_nodes_pending = 0;
+
+		alu_sched->min_limit.free_disk = 0x00FFFFFF;
+		alu_sched->min_limit.disk_usage = 0xFFFFFFFF;
+		alu_sched->min_limit.total_disk_size = 0xFFFFFFFF;
+		alu_sched->min_limit.disk_speed = 0xFFFFFFFF;
+		alu_sched->min_limit.write_usage = 0xFFFFFFFF;
+		alu_sched->min_limit.read_usage = 0xFFFFFFFF;
+		alu_sched->min_limit.nr_files = 0xFFFFFFFF;
+		alu_sched->min_limit.nr_clients = 0xFFFFFFFF;
+	}
+
+	pthread_mutex_init (&alu_sched->alu_mutex, NULL);
+	return 0;
+}
+
+static void
+alu_fini (xlator_t *xl)
+{
+	if (!xl)
+		return;
+	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+	struct alu_limits *limit = alu_sched->limits_fn;
+	struct alu_threshold *threshold = alu_sched->threshold_fn;
+	void *tmp = NULL;
+	pthread_mutex_destroy (&alu_sched->alu_mutex);
+	free (alu_sched->array);
+	while (limit) {
+		tmp = limit;
+		limit = limit->next;
+		free (tmp);
+	}
+	while (threshold) {
+		tmp = threshold;
+		threshold = threshold->next;
+		free (tmp);
+	}
+	free (alu_sched);
+}
+
+static int32_t 
+update_stat_array_cbk (call_frame_t *frame,
+		       void *cookie,
+		       xlator_t *xl,
+		       int32_t op_ret,
+		       int32_t op_errno,
+		       struct xlator_stats *trav_stats)
+{
+	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+	struct alu_limits *limits_fn = alu_sched->limits_fn;
+	int32_t idx = 0;
+  
+	pthread_mutex_lock (&alu_sched->alu_mutex);
+	for (idx = 0; idx < alu_sched->child_count; idx++) {
+		if (alu_sched->array[idx].xl == (xlator_t *)cookie)
+			break;
+	}
+	pthread_mutex_unlock (&alu_sched->alu_mutex);
+
+	if (op_ret == -1) {
+		alu_sched->array[idx].eligible = 0;
+	} else {
+		memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats));
+    
+		/* Get stats from all the child node */
+		/* Here check the limits specified by the user to 
+		   consider the nodes to be used by scheduler */
+		alu_sched->array[idx].eligible = 1;
+		limits_fn = alu_sched->limits_fn;
+		while (limits_fn){
+			if (limits_fn->max_value && 
+			    (limits_fn->cur_value (trav_stats) > 
+			     limits_fn->max_value (&(alu_sched->spec_limit)))) {
+				alu_sched->array[idx].eligible = 0;
+			}
+			if (limits_fn->min_value && 
+			    (limits_fn->cur_value (trav_stats) < 
+			     limits_fn->min_value (&(alu_sched->spec_limit)))) {
+				alu_sched->array[idx].eligible = 0;
+			}
+			limits_fn = limits_fn->next;
+		}
+
+		/* Select minimum and maximum disk_usage */
+		if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) {
+			alu_sched->max_limit.disk_usage = trav_stats->disk_usage;
+		}
+		if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) {
+			alu_sched->min_limit.disk_usage = trav_stats->disk_usage;
+		}
+
+		/* Select minimum and maximum disk_speed */
+		if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) {
+			alu_sched->max_limit.disk_speed = trav_stats->disk_speed;
+		}
+		if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) {
+			alu_sched->min_limit.disk_speed = trav_stats->disk_speed;
+		}
+
+		/* Select minimum and maximum number of open files */
+		if (trav_stats->nr_files > alu_sched->max_limit.nr_files) {
+			alu_sched->max_limit.nr_files = trav_stats->nr_files;
+		}
+		if (trav_stats->nr_files < alu_sched->min_limit.nr_files) {
+			alu_sched->min_limit.nr_files = trav_stats->nr_files;
+		}
+
+		/* Select minimum and maximum write-usage */
+		if (trav_stats->write_usage > alu_sched->max_limit.write_usage) {
+			alu_sched->max_limit.write_usage = trav_stats->write_usage;
+		}
+		if (trav_stats->write_usage < alu_sched->min_limit.write_usage) {
+			alu_sched->min_limit.write_usage = trav_stats->write_usage;
+		}
+
+		/* Select minimum and maximum read-usage */
+		if (trav_stats->read_usage > alu_sched->max_limit.read_usage) {
+			alu_sched->max_limit.read_usage = trav_stats->read_usage;
+		}
+		if (trav_stats->read_usage < alu_sched->min_limit.read_usage) {
+			alu_sched->min_limit.read_usage = trav_stats->read_usage;
+		}
+
+		/* Select minimum and maximum free-disk */
+		if (trav_stats->free_disk > alu_sched->max_limit.free_disk) {
+			alu_sched->max_limit.free_disk = trav_stats->free_disk;
+		}
+		if (trav_stats->free_disk < alu_sched->min_limit.free_disk) {
+			alu_sched->min_limit.free_disk = trav_stats->free_disk;
+		}
+	}
+
+	STACK_DESTROY (frame->root);
+
+	return 0;
+}
+
+static void 
+update_stat_array (xlator_t *xl)
+{
+	/* This function schedules the file in one of the child nodes */
+	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+	int32_t idx = 0;
+	call_frame_t *frame = NULL;
+	call_pool_t *pool = xl->ctx->pool;
+
+	for (idx = 0 ; idx < alu_sched->child_count; idx++) {
+		frame = create_frame (xl, pool);
+
+		STACK_WIND_COOKIE (frame,
+				   update_stat_array_cbk, 
+				   alu_sched->array[idx].xl, //cookie
+				   alu_sched->array[idx].xl, 
+				   (alu_sched->array[idx].xl)->mops->stats,
+				   0); //flag
+	}
+	return;
+}
+
+static void 
+alu_update (xlator_t *xl)
+{
+	struct timeval tv;
+	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+
+	gettimeofday (&tv, NULL);
+	if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) {
+		/* Update the stats from all the server */
+		update_stat_array (xl);
+		alu_sched->last_stat_fetch.tv_sec = tv.tv_sec;
+	}
+}
+
+static xlator_t *
+alu_scheduler (xlator_t *xl, const void *path)
+{
+	/* This function schedules the file in one of the child nodes */
+	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+	int32_t sched_index = 0;
+	int32_t sched_index_orig = 0;
+	int32_t idx = 0;
+
+	alu_update (xl);
+
+	/* Now check each threshold one by one if some nodes are classified */
+	{
+		struct alu_threshold *trav_threshold = alu_sched->threshold_fn;
+		struct alu_threshold *tmp_threshold = alu_sched->sched_method;
+		struct alu_sched_node *tmp_sched_node;   
+
+		/* This pointer 'trav_threshold' contains function pointers according to spec file
+		   give by user, */
+		while (trav_threshold) {
+			/* This check is needed for seeing if already there are nodes in this criteria 
+			   to be scheduled */
+			if (!alu_sched->sched_nodes_pending) {
+				for (idx = 0; idx < alu_sched->child_count; idx++) {
+					if (!alu_sched->array[idx].eligible) {
+						continue;
+					}
+					if (trav_threshold->entry_value) {
+						if (trav_threshold->diff_value (&(alu_sched->max_limit),
+										&(alu_sched->array[idx].stats)) <
+						    trav_threshold->entry_value (&(alu_sched->entry_limit))) {
+							continue;
+						}
+					}
+					tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node));
+					ERR_ABORT (tmp_sched_node);
+					tmp_sched_node->index = idx;
+					if (!alu_sched->sched_node) {
+						alu_sched->sched_node = tmp_sched_node;
+					} else {
+						pthread_mutex_lock (&alu_sched->alu_mutex);
+						tmp_sched_node->next = alu_sched->sched_node;
+						alu_sched->sched_node = tmp_sched_node;
+						pthread_mutex_unlock (&alu_sched->alu_mutex);
+					}
+					alu_sched->sched_nodes_pending++;
+				}
+			} /* end of if (sched_nodes_pending) */
+
+			/* This loop is required to check the eligible nodes */
+			struct alu_sched_node *trav_sched_node;
+			while (alu_sched->sched_nodes_pending) {
+				trav_sched_node = alu_sched->sched_node;
+				sched_index = trav_sched_node->index;
+				if (alu_sched->array[sched_index].eligible)
+					break;
+				alu_sched->sched_node = trav_sched_node->next;
+				free (trav_sched_node);
+				alu_sched->sched_nodes_pending--;
+			}
+			if (alu_sched->sched_nodes_pending) {
+				/* There are some node in this criteria to be scheduled, no need 
+				 * to sort and check other methods 
+				 */
+				if (tmp_threshold && tmp_threshold->exit_value) {
+					/* verify the exit value && whether node is eligible or not */
+					if (tmp_threshold->diff_value (&(alu_sched->max_limit),
+								       &(alu_sched->array[sched_index].stats)) >
+					    tmp_threshold->exit_value (&(alu_sched->exit_limit))) {
+						/* Free the allocated info for the node :) */
+						pthread_mutex_lock (&alu_sched->alu_mutex);
+						alu_sched->sched_node = trav_sched_node->next;
+						free (trav_sched_node);
+						trav_sched_node = alu_sched->sched_node;
+						alu_sched->sched_nodes_pending--;
+						pthread_mutex_unlock (&alu_sched->alu_mutex);
+					}
+				} else {
+					/* if there is no exit value, then exit after scheduling once */
+					pthread_mutex_lock (&alu_sched->alu_mutex);
+					alu_sched->sched_node = trav_sched_node->next;
+					free (trav_sched_node);
+					trav_sched_node = alu_sched->sched_node;
+					alu_sched->sched_nodes_pending--;
+					pthread_mutex_unlock (&alu_sched->alu_mutex);
+				}
+	
+				alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */
+
+				/* */
+				if (trav_sched_node) {
+					tmp_sched_node = trav_sched_node;
+					while (trav_sched_node->next) {
+						trav_sched_node = trav_sched_node->next;
+					}
+					if (tmp_sched_node->next) {
+						pthread_mutex_lock (&alu_sched->alu_mutex);
+						alu_sched->sched_node = tmp_sched_node->next;
+						tmp_sched_node->next = NULL;
+						trav_sched_node->next = tmp_sched_node;
+						pthread_mutex_unlock (&alu_sched->alu_mutex);
+					}
+				}
+				/* return the scheduled node */
+				return alu_sched->array[sched_index].xl;
+			} /* end of if (pending_nodes) */
+      
+			tmp_threshold = trav_threshold;
+			trav_threshold = trav_threshold->next;
+		}
+	}
+  
+	/* This is used only when there is everything seems ok, or no eligible nodes */
+	sched_index_orig = alu_sched->sched_index;
+	alu_sched->sched_method = NULL;
+	while (1) {
+		//lock
+		pthread_mutex_lock (&alu_sched->alu_mutex);
+		sched_index = alu_sched->sched_index++;
+		alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count;
+		pthread_mutex_unlock (&alu_sched->alu_mutex);
+		//unlock
+		if (alu_sched->array[sched_index].eligible)
+			break;
+		if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) {
+			gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule");
+			//lock
+			pthread_mutex_lock (&alu_sched->alu_mutex);
+			alu_sched->sched_index++;
+			alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count;
+			pthread_mutex_unlock (&alu_sched->alu_mutex);
+			//unlock
+			break;
+		}
+	}
+	return alu_sched->array[sched_index].xl;
+}
+
+/**
+ * notify
+ */
+void
+alu_notify (xlator_t *xl, int32_t event, void *data)
+{
+	struct alu_sched *alu_sched = NULL; 
+	int32_t idx = 0;
+  
+	alu_sched = (struct alu_sched *)*((long *)xl->private);
+	if (!alu_sched)
+		return;
+
+	for (idx = 0; idx < alu_sched->child_count; idx++) {
+		if (alu_sched->array[idx].xl == (xlator_t *)data)
+			break;
+	}
+
+	switch (event)
+	{
+	case GF_EVENT_CHILD_UP:
+	{
+		//alu_sched->array[idx].eligible = 1;
+	}
+	break;
+	case GF_EVENT_CHILD_DOWN:
+	{
+		alu_sched->array[idx].eligible = 0;
+	}
+	break;
+	default:
+	{
+		;
+	}
+	break;
+	}
+
+}
+
+struct sched_ops sched = {
+	.init     = alu_init,
+	.fini     = alu_fini,
+	.update   = alu_update,
+	.schedule = alu_scheduler,
+	.notify   = alu_notify
+};
+
+struct volume_options options[] = {
+	{ .key   = { "scheduler.alu.order", "alu.order" },  
+	  .type  = GF_OPTION_TYPE_ANY 
+	},
+	{ .key   = { "scheduler.alu.disk-usage.entry-threshold", 
+		     "alu.disk-usage.entry-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET
+	},
+	{ .key   = { "scheduler.alu.disk-usage.exit-threshold", 
+		     "alu.disk-usage.exit-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET
+	},
+	{ .key   = { "scheduler.alu.write-usage.entry-threshold", 
+		     "alu.write-usage.entry-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET
+	},
+	{ .key   = { "scheduler.alu.write-usage.exit-threshold", 
+		     "alu.write-usage.exit-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET 
+	},
+	{ .key   = { "scheduler.alu.read-usage.entry-threshold", 
+		     "alu.read-usage.entry-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET
+	},
+	{ .key   = { "scheduler.alu.read-usage.exit-threshold", 
+		     "alu.read-usage.exit-threshold" },  
+	  .type  = GF_OPTION_TYPE_SIZET 
+	},
+	{ .key   = { "scheduler.alu.open-files-usage.entry-threshold", 
+		     "alu.open-files-usage.entry-threshold" },  
+	  .type  = GF_OPTION_TYPE_INT
+	},
+	{ .key   = { "scheduler.alu.open-files-usage.exit-threshold", 
+		     "alu.open-files-usage.exit-threshold" },  
+	  .type  = GF_OPTION_TYPE_INT 
+	},
+	{ .key   = { "scheduler.read-only-subvolumes",
+		     "alu.read-only-subvolumes" },  
+	  .type  = GF_OPTION_TYPE_ANY 
+	},
+	{ .key   = { "scheduler.refresh-interval", 
+		     "alu.refresh-interval",
+		     "alu.stat-refresh.interval" },
+	  .type  = GF_OPTION_TYPE_TIME
+	},
+	{ .key   = { "scheduler.limits.min-free-disk", 
+		     "alu.limits.min-free-disk" },  
+	  .type  = GF_OPTION_TYPE_PERCENT
+	},
+	{ .key   = { "scheduler.alu.stat-refresh.num-file-create"
+		     "alu.stat-refresh.num-file-create"},  
+	  .type  = GF_OPTION_TYPE_INT
+	},
+	{ .key  =  {NULL}, }
+};
diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h
new file mode 100644
index 000000000..a958bb4d2
--- /dev/null
+++ b/scheduler/alu/src/alu.h
@@ -0,0 +1,89 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _ALU_H
+#define _ALU_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+
+struct alu_sched;
+
+struct alu_sched_struct {
+  xlator_t *xl;
+  struct xlator_stats stats;
+  unsigned char eligible;
+};
+
+// Write better name for these functions
+struct alu_limits {
+  struct alu_limits *next;
+  int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */
+  int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */
+  int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */
+};
+
+struct alu_threshold {
+  struct alu_threshold *next;
+  int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */
+  int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */
+  int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */
+  int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */
+};
+
+struct alu_sched_node {
+  struct alu_sched_node *next;
+  int32_t index;
+};
+
+struct alu_sched {
+  struct alu_limits *limits_fn;
+  struct alu_threshold *threshold_fn;
+  struct alu_sched_struct *array;
+  struct alu_sched_node *sched_node;
+  struct alu_threshold *sched_method;
+  struct xlator_stats max_limit;
+  struct xlator_stats min_limit;
+  struct xlator_stats entry_limit;
+  struct xlator_stats exit_limit;
+  struct xlator_stats spec_limit;     /* User given limit */
+
+  pthread_mutex_t alu_mutex;
+  struct timeval last_stat_fetch;
+  uint32_t refresh_interval;      /* in seconds */
+  uint32_t refresh_create_count;  /* num-file-create */
+
+  int32_t sched_nodes_pending;
+
+  int32_t sched_index;  /* used for round robin scheduling in case of many nodes getting the criteria match. */
+  int32_t child_count;
+
+};
+
+struct _alu_local_t {
+  int32_t call_count;
+};
+
+typedef struct _alu_local_t alu_local_t;
+
+#endif /* _ALU_H */
diff --git a/scheduler/nufa/Makefile.am b/scheduler/nufa/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/scheduler/nufa/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES = 
diff --git a/scheduler/nufa/src/Makefile.am b/scheduler/nufa/src/Makefile.am
new file mode 100644
index 000000000..6eb3d39f1
--- /dev/null
+++ b/scheduler/nufa/src/Makefile.am
@@ -0,0 +1,12 @@
+sched_LTLIBRARIES = nufa.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+nufa_la_LDFLAGS = -module -avoidversion
+
+nufa_la_SOURCES = nufa.c
+nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/scheduler/nufa/src/nufa.c b/scheduler/nufa/src/nufa.c
new file mode 100644
index 000000000..bbc61e2ad
--- /dev/null
+++ b/scheduler/nufa/src/nufa.c
@@ -0,0 +1,403 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "common-utils.h"
+
+struct nufa_sched_struct {
+	xlator_t *xl;
+	struct timeval last_stat_fetch;
+	int64_t free_disk;
+	int32_t refresh_interval;
+	unsigned char eligible;
+};
+
+struct nufa_struct {
+	struct nufa_sched_struct *array;
+	struct timeval last_stat_fetch;
+
+	int32_t *local_array; /* Used to keep the index of the local xlators */
+	int32_t local_xl_index; /* index in the above array */
+	int32_t local_xl_count; /* Count of the local subvolumes */
+
+	uint32_t refresh_interval;
+	uint32_t min_free_disk;
+  
+	gf_lock_t nufa_lock;
+	int32_t child_count;
+	int32_t sched_index;  
+};
+
+#define NUFA_LIMITS_MIN_FREE_DISK_DEFAULT    15
+#define NUFA_REFRESH_INTERVAL_DEFAULT        30
+
+static int32_t
+nufa_init (xlator_t *xl)
+{
+	int32_t index = 0;
+	data_t *local_name = NULL;
+	data_t *data = NULL;
+	xlator_list_t *trav_xl = xl->children;
+	struct nufa_struct *nufa_buf = NULL;
+
+	nufa_buf = CALLOC (1, sizeof (struct nufa_struct));
+	ERR_ABORT (nufa_buf);
+
+	data = dict_get (xl->options, "scheduler.limits.min-free-disk");
+	if (data) {
+		if (gf_string2percent (data->data, 
+				       &nufa_buf->min_free_disk) != 0) {
+			gf_log ("nufa", GF_LOG_ERROR, 
+				"invalid number format \"%s\" of "
+				"\"option scheduler.limits.min-free-disk\"", 
+				data->data);
+			return -1;
+		}
+		if (nufa_buf->min_free_disk >= 100) {
+			gf_log ("nufa", GF_LOG_ERROR,
+				"check \"option scheduler.limits.min-free-disk"
+				"\", it should be percentage value");
+			return -1;
+		}
+	} else {
+		gf_log ("nufa", GF_LOG_WARNING, 
+			"No option for limit min-free-disk given, "
+			"defaulting it to 15%%");
+		nufa_buf->min_free_disk = NUFA_LIMITS_MIN_FREE_DISK_DEFAULT;
+	}
+	data = dict_get (xl->options, "scheduler.refresh-interval");
+	if (data && (gf_string2time (data->data, 
+				    &nufa_buf->refresh_interval) != 0)) {
+		gf_log ("nufa", GF_LOG_ERROR, 
+			"invalid number format \"%s\" of "
+			"\"option scheduler.refresh-interval\"", 
+			data->data);
+		return -1;
+	} else {
+		gf_log ("nufa", GF_LOG_WARNING, 
+			"No option for scheduler.refresh-interval given, "
+			"defaulting it to 30");
+		nufa_buf->refresh_interval = NUFA_REFRESH_INTERVAL_DEFAULT;
+	}
+  
+	/* Get the array built */
+	while (trav_xl) {
+		index++;
+		trav_xl = trav_xl->next;
+	}
+	nufa_buf->child_count = index;
+	nufa_buf->sched_index = 0;
+	nufa_buf->array = CALLOC (index, sizeof (struct nufa_sched_struct));
+	ERR_ABORT (nufa_buf->array);
+	nufa_buf->local_array = CALLOC (index, sizeof (int32_t));
+	ERR_ABORT (nufa_buf->array);
+	trav_xl = xl->children;
+	
+	local_name = dict_get (xl->options, "scheduler.local-volume-name");
+	if (!local_name) {
+		/* Error */
+		gf_log ("nufa", GF_LOG_ERROR, 
+			"No 'local-volume-name' option given in volume file");
+		FREE (nufa_buf->array);
+		FREE (nufa_buf->local_array);
+		FREE (nufa_buf);
+		return -1;
+	}
+
+	/* Get the array properly */
+	index = 0;
+	trav_xl = xl->children;
+	while (trav_xl) {
+		nufa_buf->array[index].xl = trav_xl->xlator;
+		nufa_buf->array[index].eligible = 1;
+		nufa_buf->array[index].free_disk = nufa_buf->min_free_disk;
+		nufa_buf->array[index].refresh_interval = 
+			nufa_buf->refresh_interval;
+
+		trav_xl = trav_xl->next;
+		index++;
+	}
+  
+	{ 
+		int32_t array_index = 0;
+		char *child = NULL;
+		char *tmp = NULL;
+		char *childs_data = strdup (local_name->data);
+		
+		child = strtok_r (childs_data, ",", &tmp);
+		while (child) {
+			/* Check if the local_volume specified is proper 
+			   subvolume of unify */
+			trav_xl = xl->children;
+			index=0;
+			while (trav_xl) {
+				if (strcmp (child, trav_xl->xlator->name) == 0)
+					break;
+				trav_xl = trav_xl->next;
+				index++;
+			}
+
+			if (!trav_xl) {
+				/* entry for 'local-volume-name' is wrong, 
+				   not present in subvolumes */
+				gf_log ("nufa", GF_LOG_ERROR, 
+					"option 'scheduler.local-volume-name' "
+					"%s is wrong", child);
+				FREE (nufa_buf->array);
+				FREE (nufa_buf->local_array);
+				FREE (nufa_buf);
+				return -1;
+			} else {
+				nufa_buf->local_array[array_index++] = index;
+				nufa_buf->local_xl_count++;
+			}
+			child = strtok_r (NULL, ",", &tmp);
+		}
+		free (childs_data);
+	}
+
+	LOCK_INIT (&nufa_buf->nufa_lock);
+	*((long *)xl->private) = (long)nufa_buf; // put it at the proper place
+	return 0;
+}
+
+static void
+nufa_fini (xlator_t *xl)
+{
+	struct nufa_struct *nufa_buf = 
+		(struct nufa_struct *)*((long *)xl->private);
+
+	LOCK_DESTROY (&nufa_buf->nufa_lock);
+	FREE (nufa_buf->local_array);
+	FREE (nufa_buf->array);
+	FREE (nufa_buf);
+}
+
+static int32_t 
+update_stat_array_cbk (call_frame_t *frame,
+		       void *cookie,
+		       xlator_t *xl,
+		       int32_t op_ret,
+		       int32_t op_errno,
+		       struct xlator_stats *trav_stats)
+{
+	struct nufa_struct *nufa_struct = NULL; 
+	int32_t idx = 0;
+	int32_t percent = 0;
+
+	nufa_struct = (struct nufa_struct *)*((long *)xl->private);
+	LOCK (&nufa_struct->nufa_lock);
+	for (idx = 0; idx < nufa_struct->child_count; idx++) {
+		if (nufa_struct->array[idx].xl->name == (char *)cookie)
+			break;
+	}
+	UNLOCK (&nufa_struct->nufa_lock);
+
+	if (op_ret == 0) {
+		percent = ((trav_stats->free_disk * 100) / 
+			   trav_stats->total_disk_size);
+		if (nufa_struct->array[idx].free_disk > percent) {
+			if (nufa_struct->array[idx].eligible)
+				gf_log ("nufa", GF_LOG_CRITICAL,
+					"node \"%s\" is _almost_ (%d %%) full",
+					nufa_struct->array[idx].xl->name, 
+					100 - percent);
+			nufa_struct->array[idx].eligible = 0;
+		} else {
+			nufa_struct->array[idx].eligible = 1;
+		}
+	} else {
+		nufa_struct->array[idx].eligible = 0;
+	}
+
+	STACK_DESTROY (frame->root);
+	return 0;
+}
+
+static void 
+update_stat_array (xlator_t *xl)
+{
+	/* This function schedules the file in one of the child nodes */
+	int32_t idx;
+	struct nufa_struct *nufa_buf = 
+		(struct nufa_struct *)*((long *)xl->private);
+	call_frame_t *frame = NULL;
+	call_pool_t *pool = xl->ctx->pool;
+
+	for (idx = 0; idx < nufa_buf->child_count; idx++) {
+		frame = create_frame (xl, pool);
+
+		STACK_WIND_COOKIE (frame,
+				   update_stat_array_cbk, 
+				   nufa_buf->array[idx].xl->name,
+				   nufa_buf->array[idx].xl, 
+				   (nufa_buf->array[idx].xl)->mops->stats,
+				   0); //flag
+	}
+
+	return;
+}
+
+static void 
+nufa_update (xlator_t *xl)
+{
+	struct nufa_struct *nufa_buf = 
+		(struct nufa_struct *)*((long *)xl->private);
+	struct timeval tv;
+	gettimeofday (&tv, NULL);
+	if (tv.tv_sec > (nufa_buf->refresh_interval 
+			 + nufa_buf->last_stat_fetch.tv_sec)) {
+		/* Update the stats from all the server */
+		update_stat_array (xl);
+		nufa_buf->last_stat_fetch.tv_sec = tv.tv_sec;
+	}
+}
+
+static xlator_t *
+nufa_schedule (xlator_t *xl, const void *path)
+{
+	struct nufa_struct *nufa_buf = 
+		(struct nufa_struct *)*((long *)xl->private);
+	int32_t nufa_orig = nufa_buf->local_xl_index;  
+	int32_t rr;
+  
+	nufa_update (xl);
+  
+	while (1) {
+		LOCK (&nufa_buf->nufa_lock);
+		rr = nufa_buf->local_xl_index++;
+		nufa_buf->local_xl_index %= nufa_buf->local_xl_count;
+		UNLOCK (&nufa_buf->nufa_lock);
+    
+		/* if 'eligible' or there are _no_ eligible nodes */
+		if (nufa_buf->array[nufa_buf->local_array[rr]].eligible) {
+			/* Return the local node */
+			return nufa_buf->array[nufa_buf->local_array[rr]].xl;
+		}
+		if ((rr + 1) % nufa_buf->local_xl_count == nufa_orig) {
+			gf_log ("nufa", GF_LOG_CRITICAL, 
+				"No free space available on any local "
+				"volumes, using RR scheduler");
+			LOCK (&nufa_buf->nufa_lock);
+			nufa_buf->local_xl_index++;
+			nufa_buf->local_xl_index %= nufa_buf->local_xl_count;
+			UNLOCK (&nufa_buf->nufa_lock);
+			break;
+		}
+	}
+
+	nufa_orig = nufa_buf->sched_index;  
+	while (1) {
+		LOCK (&nufa_buf->nufa_lock);
+		rr = nufa_buf->sched_index++;
+		nufa_buf->sched_index = (nufa_buf->sched_index % 
+					 nufa_buf->child_count);
+		UNLOCK (&nufa_buf->nufa_lock);
+    
+		/* if 'eligible' or there are _no_ eligible nodes */
+		if (nufa_buf->array[rr].eligible) {
+			break;
+		}
+		if ((rr + 1) % nufa_buf->child_count == nufa_orig) {
+			gf_log ("nufa", GF_LOG_CRITICAL, 
+				"No free space available on any server, "
+				"using RR scheduler.");
+			LOCK (&nufa_buf->nufa_lock);
+			nufa_buf->sched_index++;
+			nufa_buf->sched_index = (nufa_buf->sched_index % 
+						 nufa_buf->child_count);
+			UNLOCK (&nufa_buf->nufa_lock);
+			break;
+		}
+	}
+	return nufa_buf->array[rr].xl;
+}
+
+
+/**
+ * notify
+ */
+void
+nufa_notify (xlator_t *xl, int32_t event, void *data)
+{
+	struct nufa_struct *nufa_buf = 
+		(struct nufa_struct *)*((long *)xl->private);
+	int32_t idx = 0;
+  
+	if (!nufa_buf)
+		return;
+
+	for (idx = 0; idx < nufa_buf->child_count; idx++) {
+		if (strcmp (nufa_buf->array[idx].xl->name, 
+			    ((xlator_t *)data)->name) == 0)
+			break;
+	}
+
+	switch (event)
+	{
+	case GF_EVENT_CHILD_UP:
+	{
+		//nufa_buf->array[idx].eligible = 1;
+	}
+	break;
+	case GF_EVENT_CHILD_DOWN:
+	{
+		nufa_buf->array[idx].eligible = 0;
+	}
+	break;
+	default:
+	{
+		;
+	}
+	break;
+	}
+
+}
+
+struct sched_ops sched = {
+	.init     = nufa_init,
+	.fini     = nufa_fini,
+	.update   = nufa_update,
+	.schedule = nufa_schedule,
+	.notify   = nufa_notify
+};
+
+struct volume_options options[] = {
+	{ .key   = { "scheduler.refresh-interval", 
+		     "nufa.refresh-interval" },  
+	  .type  = GF_OPTION_TYPE_TIME
+	},
+	{ .key   = { "scheduler.limits.min-free-disk", 
+		     "nufa.limits.min-free-disk" },  
+	  .type  = GF_OPTION_TYPE_PERCENT
+	},
+	{ .key   = { "scheduler.local-volume-name",
+		     "nufa.local-volume-name" },
+	  .type  = GF_OPTION_TYPE_XLATOR
+	},	
+	{ .key = {NULL} }
+};
+
diff --git a/scheduler/random/Makefile.am b/scheduler/random/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/scheduler/random/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES = 
diff --git a/scheduler/random/src/Makefile.am b/scheduler/random/src/Makefile.am
new file mode 100644
index 000000000..572181336
--- /dev/null
+++ b/scheduler/random/src/Makefile.am
@@ -0,0 +1,14 @@
+sched_LTLIBRARIES = random.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+random_la_LDFLAGS = -module -avoidversion
+
+random_la_SOURCES = random.c
+random_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = random.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/scheduler/random/src/random.c b/scheduler/random/src/random.c
new file mode 100644
index 000000000..9e761d08a
--- /dev/null
+++ b/scheduler/random/src/random.c
@@ -0,0 +1,283 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdlib.h>
+#include <sys/time.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "random.h"
+
+#define RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT    15
+#define RANDOM_REFRESH_INTERVAL_DEFAULT        10
+
+
+static int32_t
+random_init (xlator_t *xl)
+{
+	struct random_struct *random_buf = NULL;
+	xlator_list_t *trav_xl = xl->children;
+	data_t *limit = NULL;
+	int32_t index = 0;
+
+	random_buf = CALLOC (1, sizeof (struct random_struct));
+	ERR_ABORT (random_buf);
+  
+	/* Set the seed for the 'random' function */
+	srandom ((uint32_t) time (NULL));
+  
+	limit = dict_get (xl->options, "scheduler.limits.min-free-disk");
+	if (limit) {
+		if (gf_string2percent (data_to_str (limit),
+				       &random_buf->min_free_disk) != 0) {
+			gf_log ("random", GF_LOG_ERROR, 
+				"invalid number format \"%s\" of \"option "
+				"scheduler.limits.min-free-disk\"", 
+				limit->data);
+			return -1;
+		}
+		if (random_buf->min_free_disk >= 100) {
+			gf_log ("random", GF_LOG_ERROR,
+				"check the \"option random.limits.min-free"
+				"-disk\", it should be percentage value");
+			return -1;
+		}
+      
+	} else {
+		gf_log ("random", GF_LOG_WARNING, 
+			"No option for limit min-free-disk given, "
+			"defaulting it to 5%%");
+		random_buf->min_free_disk = 
+			RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT;
+	}
+  
+	limit = dict_get (xl->options, "scheduler.refresh-interval");
+	if (limit) {
+		if (gf_string2time (data_to_str (limit),
+				    &random_buf->refresh_interval) != 0) {
+			gf_log ("random", GF_LOG_ERROR, 
+				"invalid number format \"%s\" of "
+				"\"option random.refresh-interval\"", 
+				limit->data);
+			return -1;
+		}
+	} else {
+		random_buf->refresh_interval = RANDOM_REFRESH_INTERVAL_DEFAULT;
+	}
+  
+	while (trav_xl) {
+		index++;
+		trav_xl = trav_xl->next;
+	}
+	random_buf->child_count = index;
+	random_buf->array = CALLOC (index, 
+				    sizeof (struct random_sched_struct));
+	ERR_ABORT (random_buf->array);
+	trav_xl = xl->children;
+	index = 0;
+
+	while (trav_xl) {
+		random_buf->array[index].xl = trav_xl->xlator;
+		random_buf->array[index].eligible = 1;
+		trav_xl = trav_xl->next;
+		index++;
+	}
+	pthread_mutex_init (&random_buf->random_mutex, NULL);
+
+	// put it at the proper place  
+	*((long *)xl->private) = (long)random_buf; 
+	return 0;
+}
+
+static void
+random_fini (xlator_t *xl)
+{
+	struct random_struct *random_buf = NULL;
+
+	random_buf = (struct random_struct *)*((long *)xl->private);
+	pthread_mutex_destroy (&random_buf->random_mutex);
+	free (random_buf->array);
+	free (random_buf);
+}
+
+
+static int32_t 
+update_stat_array_cbk (call_frame_t *frame,
+		       void *cookie,
+		       xlator_t *xl,
+		       int32_t op_ret,
+		       int32_t op_errno,
+		       struct xlator_stats *trav_stats)
+{
+	int32_t idx = 0;
+	int32_t percent = 0;
+	struct random_struct *random_buf = NULL;
+	
+	random_buf = (struct random_struct *)*((long *)xl->private);
+
+	pthread_mutex_lock (&random_buf->random_mutex);
+	for (idx = 0; idx < random_buf->child_count; idx++) {
+		if (strcmp (random_buf->array[idx].xl->name, 
+			    (char *)cookie) == 0)
+			break;
+	}
+	pthread_mutex_unlock (&random_buf->random_mutex);
+
+	if (op_ret == 0) {
+		percent = ((trav_stats->free_disk *100) / 
+			   trav_stats->total_disk_size);
+		if (random_buf->min_free_disk > percent) {
+			random_buf->array[idx].eligible = 0;
+		} else {
+			random_buf->array[idx].eligible = 1;
+		}
+	} else {
+		random_buf->array[idx].eligible = 0;
+	}    
+
+	STACK_DESTROY (frame->root);
+	return 0;
+}
+
+static void 
+update_stat_array (xlator_t *xl)
+{
+	int32_t idx;
+	struct random_struct *random_buf = NULL;
+	call_frame_t *frame = NULL;
+	call_pool_t *pool = xl->ctx->pool;
+
+	random_buf = (struct random_struct *)*((long *)xl->private);
+	for (idx = 0; idx < random_buf->child_count; idx++) {
+		frame = create_frame (xl, pool);
+
+		STACK_WIND_COOKIE (frame,
+				   update_stat_array_cbk,
+				   random_buf->array[idx].xl->name,
+				   random_buf->array[idx].xl,
+				   (random_buf->array[idx].xl)->mops->stats,
+				   0);
+	}
+	return ;
+}
+
+static void 
+random_update (xlator_t *xl)
+{
+	struct timeval tv;
+	struct random_struct *random_buf = NULL;
+
+	random_buf = (struct random_struct *)*((long *)xl->private);
+
+	gettimeofday(&tv, NULL);
+	if (tv.tv_sec > (random_buf->refresh_interval + 
+			 random_buf->last_stat_entry.tv_sec)) {
+		update_stat_array (xl);
+		random_buf->last_stat_entry.tv_sec = tv.tv_sec;
+	}
+}
+
+static xlator_t *
+random_schedule (xlator_t *xl, const void *path)
+{
+	struct random_struct *random_buf = NULL;       
+	int32_t rand = 0;
+	int32_t try = 0;
+	
+	random_buf = (struct random_struct *)*((long *)xl->private);
+
+	rand = (int32_t) (1.0*random_buf->child_count * 
+			  (random() / (RAND_MAX + 1.0)));
+
+	random_update (xl);
+
+	while (!random_buf->array[rand].eligible) {
+		if (try++ > 100) {
+			/* there is a chance of this becoming a 
+			   infinite loop otherwise. */
+			break; 
+		}
+		rand = (int32_t) (1.0*random_buf->child_count * 
+				  (random() / (RAND_MAX + 1.0)));
+	}
+	return random_buf->array[rand].xl;
+}
+
+
+/**
+ * notify
+ */
+void
+random_notify (xlator_t *xl, int32_t event, void *data)
+{
+	struct random_struct *random_buf = NULL;
+	int32_t idx = 0;
+  
+	random_buf = (struct random_struct *)*((long *)xl->private);
+	if (!random_buf)
+		return;
+
+	for (idx = 0; idx < random_buf->child_count; idx++) {
+		if (random_buf->array[idx].xl == (xlator_t *)data)
+			break;
+	}
+
+	switch (event)
+	{
+	case GF_EVENT_CHILD_UP:
+	{
+		//random_buf->array[idx].eligible = 1;
+	}
+	break;
+	case GF_EVENT_CHILD_DOWN:
+	{
+		random_buf->array[idx].eligible = 0;
+	}
+	break;
+	default:
+	{
+		;
+	}
+	break;
+	}
+
+}
+
+struct sched_ops sched = {
+	.init     = random_init,
+	.fini     = random_fini,
+	.update   = random_update,
+	.schedule = random_schedule,
+	.notify   = random_notify
+};
+
+struct volume_options options[] = {
+	{ .key   = { "scheduler.refresh-interval", 
+		     "random.refresh-interval" },  
+	  .type  = GF_OPTION_TYPE_TIME
+	},
+	{ .key   = { "scheduler.limits.min-free-disk", 
+		     "random.limits.min-free-disk" },  
+	  .type  = GF_OPTION_TYPE_PERCENT
+	},
+	{ .key = {NULL} }
+};
diff --git a/scheduler/random/src/random.h b/scheduler/random/src/random.h
new file mode 100644
index 000000000..35c9e02ee
--- /dev/null
+++ b/scheduler/random/src/random.h
@@ -0,0 +1,46 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RANDOM_H
+#define _RANDOM_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <sys/time.h>
+#include "scheduler.h"
+
+struct random_sched_struct {
+  xlator_t *xl;
+  unsigned char eligible;
+};
+
+struct random_struct {
+  int32_t child_count;
+  uint32_t refresh_interval;
+  uint32_t min_free_disk;
+  struct timeval last_stat_entry;
+  struct random_sched_struct *array;
+  pthread_mutex_t random_mutex;
+};
+
+#endif /* _RANDOM_H */
diff --git a/scheduler/rr/Makefile.am b/scheduler/rr/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/scheduler/rr/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES = 
diff --git a/scheduler/rr/src/Makefile.am b/scheduler/rr/src/Makefile.am
new file mode 100644
index 000000000..7e911c0ed
--- /dev/null
+++ b/scheduler/rr/src/Makefile.am
@@ -0,0 +1,13 @@
+sched_LTLIBRARIES = rr.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+rr_la_LDFLAGS = -module -avoidversion
+
+rr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+rr_la_SOURCES = rr.c rr-options.c
+noinst_HEADERS = rr.h rr-options.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/scheduler/rr/src/rr-options.c b/scheduler/rr/src/rr-options.c
new file mode 100644
index 000000000..3f0ffcaf2
--- /dev/null
+++ b/scheduler/rr/src/rr-options.c
@@ -0,0 +1,256 @@
+/*
+  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include "scheduler.h"
+#include "rr-options.h"
+
+#define RR_LIMITS_MIN_FREE_DISK_OPTION_STRING  "scheduler.limits.min-free-disk"
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT  15
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_MIN      0
+#define RR_LIMITS_MIN_FREE_DISK_VALUE_MAX      100
+
+#define RR_REFRESH_INTERVAL_OPTION_STRING      "scheduler.refresh-interval"
+#define RR_REFRESH_INTERVAL_VALUE_DEFAULT      10
+
+#define RR_READ_ONLY_SUBVOLUMES_OPTION_STRING  "scheduler.read-only-subvolumes"
+
+#define LOG_ERROR(args...)      gf_log ("rr-options", GF_LOG_ERROR, ##args)
+#define LOG_WARNING(args...)    gf_log ("rr-options", GF_LOG_WARNING, ##args)
+
+static int 
+_rr_options_min_free_disk_validate (const char *value_string, uint32_t *n)
+{
+	uint32_t value = 0;
+  
+	if (value_string == NULL)
+	{
+		return -1;
+	}
+  
+	if (gf_string2percent (value_string, &value) != 0)
+	{
+		gf_log ("rr", 
+			GF_LOG_ERROR, 
+			"invalid number format [%s] of option [%s]", 
+			value_string, 
+			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING);
+		return -1;
+	}
+  
+	if ((value <= RR_LIMITS_MIN_FREE_DISK_VALUE_MIN) || 
+	    (value >= RR_LIMITS_MIN_FREE_DISK_VALUE_MAX))
+	{
+		gf_log ("rr", 
+			GF_LOG_ERROR, 
+			"out of range [%d] of option [%s].  Allowed range is 0 to 100.", 
+			value, 
+			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING);
+		return -1;
+	}
+  
+	*n = value;
+  
+	return 0;
+}
+
+static int 
+_rr_options_refresh_interval_validate (const char *value_string, uint32_t *n)
+{
+	uint32_t value = 0;
+  
+	if (value_string == NULL)
+	{
+		return -1;
+	}
+  
+	if (gf_string2time (value_string, &value) != 0)
+	{
+		gf_log ("rr", 
+			GF_LOG_ERROR, 
+			"invalid number format [%s] of option [%s]", 
+			value_string, 
+			RR_REFRESH_INTERVAL_OPTION_STRING);
+		return -1;
+	}
+  
+	*n = value;
+  
+	return 0;
+}
+
+static int 
+_rr_options_read_only_subvolumes_validate (const char *value_string, 
+					   char ***volume_list, 
+					   uint64_t *volume_count)
+{
+	char **vlist = NULL;
+	int vcount = 0;
+	int i = 0;
+  
+	if (value_string == NULL || volume_list == NULL || volume_count)
+	{
+		return -1;
+	}
+  
+	if (gf_strsplit (value_string, 
+			 ", ", 
+			 &vlist, 
+			 &vcount) != 0)
+	{
+		gf_log ("rr", 
+			GF_LOG_ERROR, 
+			"invalid subvolume list [%s] of option [%s]", 
+			value_string, 
+			RR_READ_ONLY_SUBVOLUMES_OPTION_STRING);
+		return -1;
+	}
+  
+	for (i = 0; i < vcount; i++)
+	{
+		if (gf_volume_name_validate (vlist[i]) != 0)
+		{
+			gf_log ("rr", 
+				GF_LOG_ERROR, 
+				"invalid subvolume name [%s] in [%s] of option [%s]", 
+				vlist[i], 
+				value_string, 
+				RR_READ_ONLY_SUBVOLUMES_OPTION_STRING);
+			goto free_exit;
+		}
+	}
+  
+	*volume_list = vlist;
+	*volume_count = vcount;
+  
+	return 0;
+  
+ free_exit:
+	for (i = 0; i < vcount; i++)
+	{
+		free (vlist[i]);
+	}
+	free (vlist);
+  
+	return -1;
+}
+
+int 
+rr_options_validate (dict_t *options, rr_options_t *rr_options)
+{
+	char *value_string = NULL;
+  
+	if (options == NULL || rr_options == NULL)
+	{
+		return -1;
+	}
+  
+	if (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING))
+		if (data_to_str (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)))
+			value_string = data_to_str (dict_get (options, 
+							      RR_LIMITS_MIN_FREE_DISK_OPTION_STRING));
+	if (value_string != NULL)
+	{
+		if (_rr_options_min_free_disk_validate (value_string, 
+							&rr_options->min_free_disk) != 0)
+		{
+			return -1;
+		}
+      
+		gf_log ("rr", 
+			GF_LOG_WARNING, 
+			"using %s = %d", 
+			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, 
+			rr_options->min_free_disk);
+	}
+	else 
+	{
+		rr_options->min_free_disk = RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT;
+      
+		gf_log ("rr", GF_LOG_DEBUG, 
+			"using %s = %d [default]", 
+			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, 
+			rr_options->min_free_disk);
+	}
+  
+	value_string = NULL;
+	if (dict_get (options, RR_REFRESH_INTERVAL_OPTION_STRING))
+		value_string = data_to_str (dict_get (options, 
+						      RR_REFRESH_INTERVAL_OPTION_STRING));
+	if (value_string != NULL)
+	{
+		if (_rr_options_refresh_interval_validate (value_string, 
+							   &rr_options->refresh_interval) != 0)
+		{
+			return -1;
+		}
+      
+		gf_log ("rr", 
+			GF_LOG_WARNING, 
+			"using %s = %d", 
+			RR_REFRESH_INTERVAL_OPTION_STRING, 
+			rr_options->refresh_interval);
+	}
+	else 
+	{
+		rr_options->refresh_interval = RR_REFRESH_INTERVAL_VALUE_DEFAULT;
+      
+		gf_log ("rr", GF_LOG_DEBUG, 
+			"using %s = %d [default]", 
+			RR_REFRESH_INTERVAL_OPTION_STRING, 
+			rr_options->refresh_interval);
+	}
+  
+	value_string = NULL;
+	if (dict_get (options, RR_READ_ONLY_SUBVOLUMES_OPTION_STRING))
+		value_string = data_to_str (dict_get (options, 
+						      RR_READ_ONLY_SUBVOLUMES_OPTION_STRING));
+	if (value_string != NULL)
+	{
+		if (_rr_options_read_only_subvolumes_validate (value_string, 
+							       &rr_options->read_only_subvolume_list, 
+							       &rr_options->read_only_subvolume_count) != 0)
+		{
+			return -1;
+		}
+      
+		gf_log ("rr", 
+			GF_LOG_WARNING, 
+			"using %s = [%s]", 
+			RR_READ_ONLY_SUBVOLUMES_OPTION_STRING, 
+			value_string);
+	}
+  
+	return 0;
+}
+
+struct volume_options options[] = {
+	{ .key   = { "scheduler.refresh-interval", 
+		     "rr.refresh-interval" },  
+	  .type  = GF_OPTION_TYPE_TIME
+	},
+	{ .key   = { "scheduler.limits.min-free-disk", 
+		     "rr.limits.min-free-disk" },  
+	  .type  = GF_OPTION_TYPE_PERCENT
+	},
+	{ .key   = { "scheduler.read-only-subvolumes", 
+		     "rr.read-only-subvolumes" },  
+	  .type  = GF_OPTION_TYPE_ANY
+	},
+	{ .key = {NULL} }
+};
diff --git a/scheduler/rr/src/rr-options.h b/scheduler/rr/src/rr-options.h
new file mode 100644
index 000000000..4818c7d49
--- /dev/null
+++ b/scheduler/rr/src/rr-options.h
@@ -0,0 +1,34 @@
+/*
+   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RR_OPTIONS_H
+#define _RR_OPTIONS_H
+
+struct rr_options
+{
+  uint32_t min_free_disk;
+  uint32_t refresh_interval;
+  char     **read_only_subvolume_list;
+  uint64_t read_only_subvolume_count;
+};
+typedef struct rr_options rr_options_t;
+
+int rr_options_validate (dict_t *options, rr_options_t *rr_options);
+
+#endif
diff --git a/scheduler/rr/src/rr.c b/scheduler/rr/src/rr.c
new file mode 100644
index 000000000..3e54ff5e1
--- /dev/null
+++ b/scheduler/rr/src/rr.c
@@ -0,0 +1,565 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/time.h>
+#include <stdlib.h>
+
+#include <stdint.h>
+
+#include "scheduler.h"
+
+#include "rr-options.h"
+#include "rr.h"
+
+#define RR_MIN_FREE_DISK_NOT_REACHED    0
+#define RR_MIN_FREE_DISK_REACHED        1
+
+#define RR_SUBVOLUME_OFFLINE    0
+#define RR_SUBVOLUME_ONLINE     1
+
+#define LOG_ERROR(args...)      gf_log ("rr", GF_LOG_ERROR, ##args)
+#define LOG_WARNING(args...)    gf_log ("rr", GF_LOG_WARNING, ##args)
+#define LOG_CRITICAL(args...)    gf_log ("rr", GF_LOG_CRITICAL, ##args)
+
+#define ROUND_ROBIN(index, count)    ((index + 1) % count)
+
+static int 
+_cleanup_rr (rr_t *rr)
+{
+	int i;
+  
+	if (rr == NULL)
+	{
+		return -1;
+	}
+  
+	if (rr->options.read_only_subvolume_list != NULL)
+	{
+		for (i = 0; i < rr->options.read_only_subvolume_count; i++)
+		{
+			free (rr->options.read_only_subvolume_list[i]);
+		}
+		free (rr->options.read_only_subvolume_list);
+	}
+  
+	free (rr->subvolume_list);
+  
+	free (rr);
+  
+	return 0;
+}
+
+int 
+rr_init (xlator_t *this_xl)
+{
+	rr_t *rr = NULL;
+	dict_t *options = NULL;
+	xlator_list_t *children = NULL;
+	uint64_t children_count = 0;
+	int i = 0;
+	int j = 0;
+  
+	if (this_xl == NULL)
+	{
+		return -1;
+	}
+  
+	if ((options = this_xl->options) == NULL)
+	{
+		return -1;
+	}
+  
+	if ((children = this_xl->children) == NULL)
+	{
+		return -1;
+	}
+  
+	if ((rr = CALLOC (1, sizeof (rr_t))) == NULL)
+	{
+		return -1;
+	}
+  
+	if (rr_options_validate (options, &rr->options) != 0)
+	{
+		free (rr);
+		return -1;
+	}
+  
+	for (i = 0; i < rr->options.read_only_subvolume_count; i++)
+	{
+		char found = 0;
+      
+		for (children = this_xl->children; 
+		     children != NULL; 
+		     children = children->next)
+		{
+			if (strcmp (rr->options.read_only_subvolume_list[i], 
+				    children->xlator->name) == 0)
+			{
+				found = 1;
+				break;
+			}
+		}
+      
+		if (!found)
+		{
+			LOG_ERROR ("read-only subvolume [%s] not found in volume list", 
+				   rr->options.read_only_subvolume_list[i]);
+			_cleanup_rr (rr);
+			return -1;
+		}
+	}
+  
+	for (children = this_xl->children; 
+	     children != NULL; 
+	     children = children->next)
+	{
+		children_count++;
+	}
+  
+	/* bala: excluding read_only_subvolumes */
+	if ((rr->subvolume_count = children_count - 
+	     rr->options.read_only_subvolume_count) == 0)
+	{
+		LOG_ERROR ("no writable volumes found for scheduling");
+		_cleanup_rr (rr);
+		return -1;
+	}
+  
+	if ((rr->subvolume_list = CALLOC (rr->subvolume_count, 
+					  sizeof (rr_subvolume_t))) == NULL)
+	{
+		_cleanup_rr (rr);
+		return -1;
+	}
+  
+	i = 0;
+	j = 0;
+	for (children = this_xl->children; 
+	     children != NULL; 
+	     children = children->next)
+	{
+		char found = 0;
+      
+		for (j = 0; j < rr->options.read_only_subvolume_count; j++)
+		{
+			if (strcmp (rr->options.read_only_subvolume_list[i], 
+				    children->xlator->name) == 0)
+			{
+				found = 1;
+				break;
+			}
+		}
+      
+		if (!found)
+		{
+			rr_subvolume_t *subvolume = NULL;
+	  
+			subvolume = &rr->subvolume_list[i];
+	  
+			subvolume->xl = children->xlator;
+			subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED;
+			subvolume->status = RR_SUBVOLUME_ONLINE;
+	  
+			i++;
+		}
+	}
+  
+	rr->schedule_index = UINT64_MAX;
+	rr->last_stat_fetched_time.tv_sec = 0;
+	rr->last_stat_fetched_time.tv_usec = 0;
+	pthread_mutex_init (&rr->mutex, NULL);
+  
+	*((long *)this_xl->private) = (long)rr;
+  
+	return 0;
+}
+
+void 
+rr_fini (xlator_t *this_xl)
+{
+	rr_t *rr = NULL;
+  
+	if (this_xl == NULL)
+	{
+		return;
+	}
+  
+	if ((rr = (rr_t *) *((long *)this_xl->private)) != NULL)
+	{
+		pthread_mutex_destroy (&rr->mutex);
+		_cleanup_rr (rr);
+		this_xl->private = NULL;
+	}
+  
+	return;
+}
+
+xlator_t *
+rr_schedule (xlator_t *this_xl, const void *path)
+{
+	rr_t *rr = NULL;
+	uint64_t next_schedule_index = 0;
+	int i = 0;
+  
+	if (this_xl == NULL || path == NULL)
+	{
+		return NULL;
+	}
+  
+	rr = (rr_t *) *((long *)this_xl->private);
+	next_schedule_index = ROUND_ROBIN (rr->schedule_index, 
+					   rr->subvolume_count);
+  
+	rr_update (this_xl);
+  
+	for (i = next_schedule_index; i < rr->subvolume_count; i++)
+	{
+		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && 
+		    rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED)
+		{
+			pthread_mutex_lock (&rr->mutex);
+			rr->schedule_index = i;
+			pthread_mutex_unlock (&rr->mutex);
+			return rr->subvolume_list[i].xl;
+		}
+	}
+  
+	for (i = 0; i < next_schedule_index; i++)
+	{
+		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && 
+		    rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED)
+		{
+			pthread_mutex_lock (&rr->mutex);
+			rr->schedule_index = i;
+			pthread_mutex_unlock (&rr->mutex);
+			return rr->subvolume_list[i].xl;
+		}
+	}
+  
+	for (i = next_schedule_index; i < rr->subvolume_count; i++)
+	{
+		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE)
+		{
+			pthread_mutex_lock (&rr->mutex);
+			rr->schedule_index = i;
+			pthread_mutex_unlock (&rr->mutex);
+			return rr->subvolume_list[i].xl;
+		}
+	}
+  
+	for (i = 0; i < next_schedule_index; i++)
+	{
+		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE)
+		{
+			pthread_mutex_lock (&rr->mutex);
+			rr->schedule_index = i;
+			pthread_mutex_unlock (&rr->mutex);
+			return rr->subvolume_list[i].xl;
+		}
+	}
+  
+	return NULL;
+}
+
+void
+rr_update (xlator_t *this_xl)
+{
+	rr_t *rr = NULL;
+	struct timeval ctime = {0, 0};
+	int i = 0;
+  
+	if (this_xl == NULL)
+	{
+		return ;
+	}
+  
+	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+	{
+		return ;
+	}
+  
+	if (gettimeofday (&ctime, NULL) != 0)
+	{
+		return ;
+	}
+  
+	if (ctime.tv_sec > (rr->options.refresh_interval + 
+			    rr->last_stat_fetched_time.tv_sec))
+	{
+		pthread_mutex_lock (&rr->mutex);
+		rr->last_stat_fetched_time = ctime;
+		pthread_mutex_unlock (&rr->mutex);
+      
+		for (i = 0; i < rr->subvolume_count; i++)
+		{
+			xlator_t *subvolume_xl = NULL;
+			call_frame_t *frame = NULL;
+			call_pool_t *pool = NULL;
+	  
+			subvolume_xl = rr->subvolume_list[i].xl;
+	  
+			pool = this_xl->ctx->pool;
+	  
+			frame = create_frame (this_xl, pool);
+
+			STACK_WIND_COOKIE (frame,
+					   rr_update_cbk, 
+					   subvolume_xl->name, 
+					   subvolume_xl, 
+					   subvolume_xl->mops->stats, 
+					   0);
+		}
+	}
+  
+	return ;
+}
+
+int 
+rr_update_cbk (call_frame_t *frame, 
+	       void *cookie, 
+	       xlator_t *this_xl, 
+	       int32_t op_ret, 
+	       int32_t op_errno, 
+	       struct xlator_stats *stats)
+{
+	rr_t *rr = NULL;
+	rr_subvolume_t *subvolume = NULL;
+	uint8_t free_disk_percent = 0;
+	int i = 0;
+  
+	if (frame == NULL)
+	{
+		return -1;
+	}
+  
+	if (cookie == NULL || this_xl == NULL)
+	{
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	if (op_ret == 0 && stats == NULL)
+	{
+		LOG_CRITICAL ("fatal! op_ret is 0 and stats is NULL.  "
+			      "Please report this to <gluster-devel@nongnu.org>");
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+	{
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	for (i = 0; i < rr->subvolume_count; i++)
+	{
+		if (rr->subvolume_list[i].xl->name == (char *) cookie)
+		{
+			subvolume = &rr->subvolume_list[i];
+			break;
+		}
+	}
+  
+	if (subvolume == NULL)
+	{
+		LOG_ERROR ("unknown cookie [%s]", (char *) cookie);
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	if (op_ret == 0)
+	{
+		free_disk_percent = (stats->free_disk * 100) / stats->total_disk_size;
+		if (free_disk_percent > rr->options.min_free_disk)
+		{
+			if (subvolume->free_disk_status != RR_MIN_FREE_DISK_NOT_REACHED)
+			{
+				pthread_mutex_lock (&rr->mutex);
+				subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED;
+				pthread_mutex_unlock (&rr->mutex);
+				LOG_WARNING ("subvolume [%s] is available with free space for scheduling", 
+					     subvolume->xl->name);
+			}
+		}
+		else
+		{
+			if (subvolume->free_disk_status != RR_MIN_FREE_DISK_REACHED)
+			{
+				pthread_mutex_lock (&rr->mutex);
+				subvolume->free_disk_status = RR_MIN_FREE_DISK_REACHED;
+				pthread_mutex_unlock (&rr->mutex);
+				LOG_WARNING ("subvolume [%s] reached minimum disk space requirement", 
+					     subvolume->xl->name);
+			}
+		}
+	}
+	else 
+	{
+		pthread_mutex_lock (&rr->mutex);
+		subvolume->status = RR_SUBVOLUME_OFFLINE;
+		pthread_mutex_unlock (&rr->mutex);
+		LOG_ERROR ("unable to get subvolume [%s] status information and "
+			   "scheduling is disabled", 
+			   subvolume->xl->name);
+	}
+  
+	STACK_DESTROY (frame->root);
+	return 0;
+}
+
+void
+rr_notify (xlator_t *this_xl, int32_t event, void *data)
+{
+	rr_t *rr = NULL;
+	rr_subvolume_t *subvolume = NULL;
+	xlator_t *subvolume_xl = NULL;
+	int i = 0, ret = 0;
+	call_frame_t *frame = NULL;
+	call_pool_t *pool = NULL;
+	dict_t *xattr = get_new_dict ();
+	int32_t version[1] = {1};
+
+	if (this_xl == NULL || data == NULL) {
+		return ;
+	}
+  
+	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) {
+		return ;
+	}
+  
+	subvolume_xl = (xlator_t *) data;
+  
+	for (i = 0; i < rr->subvolume_count; i++) {
+		if (rr->subvolume_list[i].xl == subvolume_xl) {
+			subvolume = &rr->subvolume_list[i];
+			break;
+		}
+	}
+  
+	switch (event) {
+	case GF_EVENT_CHILD_UP:
+		/* Seeding, to be done only once */
+		if (rr->first_time && (i == rr->subvolume_count)) {
+			loc_t loc = {0,};
+			xlator_t *trav = NULL;
+
+			pool = this_xl->ctx->pool;
+			frame = create_frame (this_xl, pool);
+			ret = dict_set_bin (xattr, "trusted.glusterfs.scheduler.rr",
+					    version, sizeof (int32_t));
+			if (-1 == ret) {
+				gf_log (this_xl->name, GF_LOG_ERROR, "rr seed setting failed");
+			}
+			if (xattr)
+				dict_ref (xattr);
+			
+			loc.path = strdup ("/");
+			for (trav = this_xl->parents->xlator; trav; trav = trav->parents->xlator) {
+				if (trav->itable) {
+					loc.inode = trav->itable->root;
+					break;
+				}
+			}
+			STACK_WIND (frame,
+				    rr_notify_cbk,
+				    (xlator_t *)data,
+				    ((xlator_t *)data)->fops->xattrop,
+				    &loc,
+				    GF_XATTROP_ADD_ARRAY,
+				    xattr);
+	  
+			if (xattr)
+				dict_unref (xattr);
+
+			rr->first_time = 0;
+		}
+		if (subvolume) {
+			pthread_mutex_lock (&rr->mutex);
+			subvolume->status = RR_SUBVOLUME_ONLINE;
+			pthread_mutex_unlock (&rr->mutex);
+		}
+		break;
+	case GF_EVENT_CHILD_DOWN:
+		if (subvolume) {
+			pthread_mutex_lock (&rr->mutex);
+			subvolume->status = RR_SUBVOLUME_OFFLINE;
+			pthread_mutex_unlock (&rr->mutex);
+		}
+		break;
+	}
+  
+	return ;
+}
+
+int 
+rr_notify_cbk (call_frame_t *frame, 
+	       void *cookie, 
+	       xlator_t *this_xl, 
+	       int32_t op_ret, 
+	       int32_t op_errno,
+	       dict_t *xattr)
+{
+	rr_t *rr = NULL;
+	int32_t *index = NULL;
+	int32_t ret = -1;
+	void *tmp_index_ptr = NULL;
+
+	if (frame == NULL) 
+	{
+		return -1;
+	}
+  
+	if ((this_xl == NULL) || (op_ret == -1))
+	{
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL)
+	{
+		STACK_DESTROY (frame->root);
+		return -1;
+	}
+  
+	ret = dict_get_bin (xattr, "trusted.glusterfs.scheduler.rr", &tmp_index_ptr);
+	index = tmp_index_ptr;
+	if (ret == 0)
+		rr->schedule_index = (index[0] % rr->subvolume_count);
+	else
+		rr->schedule_index = 0;
+
+	STACK_DESTROY (frame->root);
+	return 0;
+}
+
+struct sched_ops sched = {
+	.init     = rr_init,
+	.fini     = rr_fini,
+	.update   = rr_update,
+	.schedule = rr_schedule,
+	.notify   = rr_notify
+};
+
diff --git a/scheduler/rr/src/rr.h b/scheduler/rr/src/rr.h
new file mode 100644
index 000000000..baa471209
--- /dev/null
+++ b/scheduler/rr/src/rr.h
@@ -0,0 +1,70 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RR_H
+#define _RR_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "scheduler.h"
+#include <stdint.h>
+#include <sys/time.h>
+
+struct rr_subvolume
+{
+  xlator_t  *xl;
+  uint8_t   free_disk_status;
+  uint8_t   status;
+};
+typedef struct rr_subvolume rr_subvolume_t;
+
+struct rr
+{
+  rr_options_t    options;
+  rr_subvolume_t  *subvolume_list;
+  uint64_t        subvolume_count;
+  uint64_t        schedule_index;
+  struct timeval  last_stat_fetched_time;
+  pthread_mutex_t mutex;
+  char            first_time;
+};
+typedef struct rr rr_t;
+
+int rr_init (xlator_t *this_xl);
+void rr_fini (xlator_t *this_xl);
+xlator_t *rr_schedule (xlator_t *this_xl, const void *path);
+void rr_update (xlator_t *this_xl);
+int rr_update_cbk (call_frame_t *frame, 
+		   void *cookie, 
+		   xlator_t *this_xl, 
+		   int32_t op_ret, 
+		   int32_t op_errno, 
+		   struct xlator_stats *stats);
+void rr_notify (xlator_t *this_xl, int32_t event, void *data);
+int rr_notify_cbk (call_frame_t *frame, 
+		   void *cookie, 
+		   xlator_t *this_xl, 
+		   int32_t op_ret, 
+		   int32_t op_errno,
+		   dict_t *xattr);
+
+#endif /* _RR_H */
diff --git a/scheduler/switch/Makefile.am b/scheduler/switch/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/scheduler/switch/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES = 
diff --git a/scheduler/switch/src/Makefile.am b/scheduler/switch/src/Makefile.am
new file mode 100644
index 000000000..dc7d16d40
--- /dev/null
+++ b/scheduler/switch/src/Makefile.am
@@ -0,0 +1,12 @@
+sched_LTLIBRARIES = switch.la
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler
+
+switch_la_LDFLAGS = -module -avoidversion
+
+switch_la_SOURCES = switch.c
+switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/scheduler/switch/src/switch.c b/scheduler/switch/src/switch.c
new file mode 100644
index 000000000..70b307187
--- /dev/null
+++ b/scheduler/switch/src/switch.c
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <fnmatch.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "scheduler.h"
+
+struct switch_sched_array {
+	xlator_t *xl;
+	int32_t   eligible;
+	int32_t   considered;
+};
+
+/* Select one of this struct based on the path's pattern match */
+struct switch_sched_struct {
+	struct switch_sched_struct *next;
+	struct switch_sched_array  *array;
+	char                        path_pattern[256];
+	int32_t                     node_index; /* Index of the node in 
+						   this pattern. */
+	int32_t                     num_child;  /* Total num of child nodes 
+						   with this pattern. */
+};
+
+struct switch_struct {
+	struct switch_sched_struct *cond;
+	struct switch_sched_array  *array;
+	pthread_mutex_t             switch_mutex;
+	int32_t                     child_count;
+};
+
+/* This function should return child node as '*:subvolumes' is inserterd */
+static xlator_t *
+switch_get_matching_xl (const char *path, struct switch_sched_struct *cond)
+{
+	struct switch_sched_struct *trav      = cond;
+	char                       *pathname  = strdup (path);
+	int                         index     = 0;
+
+	while (trav) {
+		if (fnmatch (trav->path_pattern, 
+			     pathname, FNM_NOESCAPE) == 0) {
+			free (pathname);
+			trav->node_index %= trav->num_child;
+			index = (trav->node_index++) % trav->num_child;
+			return trav->array[index].xl;
+		}
+		trav = trav->next;
+	}
+	free (pathname);
+	return NULL;
+}
+
+
+static int32_t
+switch_init (xlator_t *xl)
+{
+	int32_t index = 0;
+	data_t *data = NULL;
+	char *child = NULL;
+	char *tmp = NULL;
+	char *childs_data = NULL;
+	xlator_list_t *trav_xl = xl->children;
+	struct switch_struct *switch_buf = NULL;
+  
+	switch_buf = CALLOC (1, sizeof (struct switch_struct));
+	ERR_ABORT (switch_buf);
+
+	while (trav_xl) {
+		index++;
+		trav_xl = trav_xl->next;
+	}
+	switch_buf->child_count = index;
+	switch_buf->array = CALLOC (index + 1, 
+				    sizeof (struct switch_sched_struct));
+	ERR_ABORT (switch_buf->array);
+	trav_xl = xl->children;
+	index = 0;
+
+	while (trav_xl) {
+		switch_buf->array[index].xl = trav_xl->xlator;
+		switch_buf->array[index].eligible = 1;
+		trav_xl = trav_xl->next;
+		index++;
+	}
+
+	data = dict_get (xl->options, "scheduler.read-only-subvolumes");
+	if (data) {
+		childs_data = strdup (data->data);
+		child = strtok_r (childs_data, ",", &tmp);
+		while (child) {
+			for (index = 1; 
+			     index < switch_buf->child_count; index++) {
+				if (strcmp (switch_buf->array[index - 1].xl->name, child) == 0) {
+					gf_log ("switch", GF_LOG_DEBUG, 
+						"Child '%s' is read-only", 
+						child);
+					memcpy (&(switch_buf->array[index-1]),
+						&(switch_buf->array[switch_buf->child_count - 1]), 
+						sizeof (struct switch_sched_array));
+					switch_buf->child_count--;
+					break;
+				}
+			}
+			child = strtok_r (NULL, ",", &tmp);
+		}
+		free (childs_data);
+	}
+
+	data = dict_get (xl->options, "scheduler.local-volume-name");
+	if (data) {
+		/* Means, give preference to that node first */
+		gf_log ("switch", GF_LOG_DEBUG, 
+			"local volume defined as %s", data->data);
+
+		/* TODO: parse it properly, have an extra index to 
+		   specify that first */
+	}
+
+	/*  *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */
+	data = dict_get (xl->options, "scheduler.switch.case");
+	if (data) {
+		char *tmp_str = NULL; 
+		char *tmp_str1 = NULL;
+		char *dup_str = NULL;
+		char *switch_str = NULL;
+		char *pattern = NULL;
+		char *childs = NULL;
+		struct switch_sched_struct *switch_opt = NULL;
+		struct switch_sched_struct *trav = NULL;
+		/* Get the pattern for considering switch case. 
+		   "option block-size *avi:10MB" etc */
+		switch_str = strtok_r (data->data, ";", &tmp_str);
+		while (switch_str) {
+			dup_str = strdup (switch_str);
+			switch_opt = 
+				CALLOC (1, 
+					sizeof (struct switch_sched_struct));
+			ERR_ABORT (switch_opt);
+
+			/* Link it to the main structure */
+			if (switch_buf->cond) {
+				/* there are already few entries */
+				trav = switch_buf->cond;
+				while (trav->next)
+					trav = trav->next;
+				trav->next = switch_opt;
+			} else {
+				/* First entry */
+				switch_buf->cond = switch_opt;
+			}
+			pattern = strtok_r (dup_str, ":", &tmp_str1);
+			childs = strtok_r (NULL, ":", &tmp_str1);
+			if (strncmp (pattern, "*", 2) == 0) {
+				gf_log ("switch", GF_LOG_WARNING,
+					"'*' pattern will be taken by default "
+					"for all the unconfigured child nodes,"
+					" hence neglecting current option");
+				switch_str = strtok_r (NULL, ";", &tmp_str);
+				free (dup_str);
+				continue;
+			}
+			memcpy (switch_opt->path_pattern, 
+				pattern, strlen (pattern));
+			if (childs) {
+				int32_t idx = 0;
+				char *tmp1 = NULL;
+				char *dup_childs = NULL;
+				/* TODO: get the list of child nodes for 
+				   the given pattern */
+				dup_childs = strdup (childs);
+				child = strtok_r (dup_childs, ",", &tmp);
+				while (child) {
+					idx++;
+					child = strtok_r (NULL, ",", &tmp);
+				}
+				free (dup_childs);
+				child = strtok_r (childs, ",", &tmp1);
+				switch_opt->num_child = idx;
+				switch_opt->array = 
+					CALLOC (1, idx * sizeof (struct switch_sched_array));
+				ERR_ABORT (switch_opt->array);
+				idx = 0;
+				child = strtok_r (childs, ",", &tmp);
+				while (child) {
+					for (index = 1; 
+					     index < switch_buf->child_count; 
+					     index++) {
+						if (strcmp (switch_buf->array[index - 1].xl->name, 
+							    child) == 0) {
+							gf_log ("switch", 
+								GF_LOG_DEBUG,
+								"'%s' pattern will be scheduled to \"%s\"",
+								switch_opt->path_pattern, child);
+							/*
+							  if (switch_buf->array[index-1].considered) {
+							  gf_log ("switch", GF_LOG_DEBUG, 
+							  "ambiguity found, exiting");
+							  return -1;
+							  }
+							*/
+							switch_opt->array[idx].xl = switch_buf->array[index-1].xl;
+							switch_buf->array[index-1].considered = 1;
+							idx++;
+							break;
+						}
+					}
+					child = strtok_r (NULL, ",", &tmp1);
+				}
+			} else {
+				/* error */
+				gf_log ("switch", GF_LOG_ERROR, 
+					"Check \"scheduler.switch.case\" "
+					"option in unify volume. Exiting");
+				free (switch_buf->array);
+				free (switch_buf);
+				return -1;
+			}
+			free (dup_str);
+			switch_str = strtok_r (NULL, ";", &tmp_str);
+		}
+	}
+	/* Now, all the pattern based considerations done, so for all the 
+	 * remaining pattern, '*' to all the remaining child nodes
+	 */
+	{
+		struct switch_sched_struct *switch_opt = NULL;
+		int32_t flag = 0;
+		int32_t index = 0;
+		for (index=0; index < switch_buf->child_count; index++) {
+			/* check for considered flag */
+			if (switch_buf->array[index].considered)
+				continue;
+			flag++;
+		}
+		if (!flag) {
+			gf_log ("switch", GF_LOG_ERROR,
+				"No nodes left for pattern '*'. Exiting.");
+			return -1;
+		}
+		switch_opt = CALLOC (1, sizeof (struct switch_sched_struct));
+		ERR_ABORT (switch_opt);
+		if (switch_buf->cond) {
+			/* there are already few entries */
+			struct switch_sched_struct *trav = switch_buf->cond;
+			while (trav->next)
+				trav = trav->next;
+			trav->next = switch_opt;
+		} else {
+			/* First entry */
+			switch_buf->cond = switch_opt;
+		}
+		/* Add the '*' pattern to the array */
+		memcpy (switch_opt->path_pattern, "*", 2);
+		switch_opt->num_child = flag;
+		switch_opt->array = 
+			CALLOC (1, flag * sizeof (struct switch_sched_array));
+		ERR_ABORT (switch_opt->array);
+		flag = 0;
+		for (index=0; index < switch_buf->child_count; index++) {
+			/* check for considered flag */
+			if (switch_buf->array[index].considered)
+				continue;
+			gf_log ("switch", GF_LOG_DEBUG,
+				"'%s' pattern will be scheduled to \"%s\"",
+				switch_opt->path_pattern, 
+				switch_buf->array[index].xl->name);
+			switch_opt->array[flag].xl = 
+				switch_buf->array[index].xl;
+			switch_buf->array[index].considered = 1;
+			flag++;
+		}
+	}
+
+	pthread_mutex_init (&switch_buf->switch_mutex, NULL);
+
+	// put it at the proper place
+	*((long *)xl->private) = (long)switch_buf; 
+
+	return 0;
+}
+
+static void
+switch_fini (xlator_t *xl)
+{
+	/* TODO: free all the allocated entries */
+	struct switch_struct *switch_buf = NULL;
+	switch_buf = (struct switch_struct *)*((long *)xl->private);
+
+	pthread_mutex_destroy (&switch_buf->switch_mutex);
+	free (switch_buf->array);
+	free (switch_buf);
+}
+
+static xlator_t *
+switch_schedule (xlator_t *xl, const void *path)
+{
+	struct switch_struct *switch_buf = NULL;
+	switch_buf = (struct switch_struct *)*((long *)xl->private);
+
+	return switch_get_matching_xl (path, switch_buf->cond);
+}
+
+
+/**
+ * notify
+ */
+void
+switch_notify (xlator_t *xl, int32_t event, void *data)
+{
+	/* TODO: This should be checking in switch_sched_struct */
+#if 0
+	struct switch_struct *switch_buf = NULL;
+	int32_t idx = 0;
+
+	switch_buf = (struct switch_struct *)*((long *)xl->private);
+	if (!switch_buf)
+		return;
+
+	for (idx = 0; idx < switch_buf->child_count; idx++) {
+		if (switch_buf->array[idx].xl == (xlator_t *)data)
+			break;
+	}
+
+	switch (event)
+	{
+	case GF_EVENT_CHILD_UP:
+	{
+		switch_buf->array[idx].eligible = 1;
+	}
+	break;
+	case GF_EVENT_CHILD_DOWN:
+	{
+		switch_buf->array[idx].eligible = 0;
+	}
+	break;
+	default:
+	{
+		;
+	}
+	break;
+	}
+#endif
+}
+
+static void 
+switch_update (xlator_t *xl)
+{
+	return;
+}
+
+struct sched_ops sched = {
+	.init     = switch_init,
+	.fini     = switch_fini,
+	.update   = switch_update,
+	.schedule = switch_schedule,
+	.notify   = switch_notify
+};
+
+struct volume_options options[] = {
+	{ .key   = { "scheduler.read-only-subvolumes" ,
+		     "switch.read-only-subvolumes"},  
+	  .type  = GF_OPTION_TYPE_ANY
+	},
+	{ .key   = { "scheduler.local-volume-name",
+		     "switch.nufa.local-volume-name" },
+	  .type  = GF_OPTION_TYPE_XLATOR
+	},
+	{ .key   = { "scheduler.switch.case",
+		     "switch.case" },
+	  .type  = GF_OPTION_TYPE_ANY
+	},
+	{ .key = {NULL} }
+};
author	Vikas Gorur <vikas@zresearch.com>	2009-02-18 17:36:07 +0530
committer	Vikas Gorur <vikas@zresearch.com>	2009-02-18 17:36:07 +0530
commit	77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree	02e155a5753b398ee572b45793f889b538efab6b /scheduler
parent	f3b2e6580e5663292ee113c741343c8a43ee133f (diff)