Diffstat (limited to 'scheduler')
| -rw-r--r-- | scheduler/Makefile.am            |   3 |
| -rw-r--r-- | scheduler/alu/Makefile.am        |   3 |
| -rw-r--r-- | scheduler/alu/src/Makefile.am    |  14 |
| -rw-r--r-- | scheduler/alu/src/alu.c          | 993 |
| -rw-r--r-- | scheduler/alu/src/alu.h          |  89 |
| -rw-r--r-- | scheduler/nufa/Makefile.am       |   3 |
| -rw-r--r-- | scheduler/nufa/src/Makefile.am   |  12 |
| -rw-r--r-- | scheduler/nufa/src/nufa.c        | 403 |
| -rw-r--r-- | scheduler/random/Makefile.am     |   3 |
| -rw-r--r-- | scheduler/random/src/Makefile.am |  14 |
| -rw-r--r-- | scheduler/random/src/random.c    | 283 |
| -rw-r--r-- | scheduler/random/src/random.h    |  46 |
| -rw-r--r-- | scheduler/rr/Makefile.am         |   3 |
| -rw-r--r-- | scheduler/rr/src/Makefile.am     |  13 |
| -rw-r--r-- | scheduler/rr/src/rr-options.c    | 256 |
| -rw-r--r-- | scheduler/rr/src/rr-options.h    |  34 |
| -rw-r--r-- | scheduler/rr/src/rr.c            | 565 |
| -rw-r--r-- | scheduler/rr/src/rr.h            |  70 |
| -rw-r--r-- | scheduler/switch/Makefile.am     |   3 |
| -rw-r--r-- | scheduler/switch/src/Makefile.am |  12 |
| -rw-r--r-- | scheduler/switch/src/switch.c    | 398 |
21 files changed, 3220 insertions, 0 deletions
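All five modules added by this patch (alu, nufa, random, rr, switch) plug into the same dispatch table, struct sched_ops, which each source file fills in at its bottom as a global named "sched". The real declaration lives in scheduler.h inside libglusterfs and is not part of this diff; the sketch below is only an illustration, reconstructed from the member names and the signatures of the functions assigned to them in alu.c, nufa.c and random.c, and it assumes the glusterfs xlator_t type from the project's headers.

/* Illustrative sketch only: inferred from how the schedulers in this patch
 * populate the table, not copied from the actual scheduler.h. */
struct sched_ops {
        int32_t   (*init)     (xlator_t *xl);                   /* parse options, build the child array      */
        void      (*fini)     (xlator_t *xl);                   /* free per-scheduler state                   */
        void      (*update)   (xlator_t *xl);                   /* refresh stats when the interval has passed */
        xlator_t *(*schedule) (xlator_t *xl, const void *path); /* pick the subvolume for a new file          */
        void      (*notify)   (xlator_t *xl, int32_t event, void *data); /* react to CHILD_UP / CHILD_DOWN    */
};

/* Each module exports exactly one instance, e.g. in alu.c:
 *
 *   struct sched_ops sched = {
 *           .init = alu_init, .fini = alu_fini, .update = alu_update,
 *           .schedule = alu_scheduler, .notify = alu_notify
 *   };
 *
 * The Makefile.am files install each module under
 * $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler, from where the consuming
 * cluster translator loads the shared object and resolves the "sched" symbol. */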
diff --git a/scheduler/Makefile.am b/scheduler/Makefile.am new file mode 100644 index 00000000000..618fa7dd8b8 --- /dev/null +++ b/scheduler/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = alu random nufa rr switch + +CLEANFILES =  diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/alu/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am new file mode 100644 index 00000000000..eb7d0db07e0 --- /dev/null +++ b/scheduler/alu/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = alu.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +alu_la_LDFLAGS = -module -avoidversion + +alu_la_SOURCES = alu.c +alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = alu.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c new file mode 100644 index 00000000000..754c5e35352 --- /dev/null +++ b/scheduler/alu/src/alu.c @@ -0,0 +1,993 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + + + +/* ALU code needs a complete re-write. 
This is one of the most important  + * part of GlusterFS and so needs more and more reviews and testing  + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> +#include <stdint.h> +#include "stack.h" +#include "alu.h" + +#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT         (1 * GF_UNIT_GB) +#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT          (512 * GF_UNIT_MB) + +#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT        25 +#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT         5 + +#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT         25 +#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT          5 + +#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT   1000 +#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT    100 + +#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT             100 + +#define ALU_REFRESH_INTERVAL_DEFAULT                   5 +#define ALU_REFRESH_CREATE_COUNT_DEFAULT               5 + + +static int64_t  +get_stats_disk_usage (struct xlator_stats *this) +{ +	return this->disk_usage; +} + +static int64_t  +get_stats_write_usage (struct xlator_stats *this) +{ +	return this->write_usage; +} + +static int64_t  +get_stats_read_usage (struct xlator_stats *this) +{ +	return this->read_usage; +} + +static int64_t  +get_stats_disk_speed (struct xlator_stats *this) +{ +	return this->disk_speed; +} + +static int64_t  +get_stats_file_usage (struct xlator_stats *this) +{ +	/* Avoid warning "defined but not used" */ +	(void) &get_stats_file_usage;     + +	return this->nr_files; +} + +static int64_t  +get_stats_free_disk (struct xlator_stats *this) +{ +	if (this->total_disk_size > 0) +		return (this->free_disk * 100) / this->total_disk_size; +	return 0; +} + +static int64_t  +get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min) +{ +	return (max->write_usage - min->write_usage); +} + +static int64_t  +get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min) +{ +	return (max->read_usage - min->read_usage); +} + +static int64_t  +get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min) +{ +	return (max->disk_usage - min->disk_usage); +} + +static int64_t  +get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min) +{ +	return (max->disk_speed - min->disk_speed); +} + +static int64_t  +get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min) +{ +	return (max->nr_files - min->nr_files); +} + + +int  +alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched) +{ +	data_t *order = dict_get (xl->options, "scheduler.alu.order"); +	if (!order) { +		gf_log (xl->name, GF_LOG_ERROR, +			"option 'scheduler.alu.order' not specified"); +		return -1; +	} +	struct alu_threshold *_threshold_fn; +	struct alu_threshold *tmp_threshold; +	data_t *entry_fn = NULL; +	data_t *exit_fn = NULL; +	char *tmp_str = NULL; +	char *order_str = strtok_r (order->data, ":", &tmp_str); +	/* Get the scheduling priority order, specified by the user. 
*/ +	while (order_str) { +		gf_log ("alu", GF_LOG_DEBUG, +			"alu_init: order string: %s", +			order_str); +		if (strcmp (order_str, "disk-usage") == 0) { +			/* Disk usage */ +			_threshold_fn =  +				CALLOC (1, sizeof (struct alu_threshold)); +			ERR_ABORT (_threshold_fn); +			_threshold_fn->diff_value = get_max_diff_disk_usage; +			_threshold_fn->sched_value = get_stats_disk_usage; +			entry_fn =  +				dict_get (xl->options,  +					  "scheduler.alu.disk-usage.entry-threshold"); +			if (entry_fn) { +				if (gf_string2bytesize (entry_fn->data,  +							&alu_sched->entry_limit.disk_usage) != 0) { +					gf_log (xl->name, GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." +						"disk-usage.entry-threshold\"", +						entry_fn->data); +					return -1; +				} +			} else { +				alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT; +			} +			_threshold_fn->entry_value = get_stats_disk_usage; +			exit_fn = dict_get (xl->options,  +					    "scheduler.alu.disk-usage.exit-threshold"); +			if (exit_fn) { +				if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0)	{ +					gf_log (xl->name, GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." +						"disk-usage.exit-threshold\"",  +						exit_fn->data); +					return -1; +				} +			} else { +				alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT; +			} +			_threshold_fn->exit_value = get_stats_disk_usage; +			tmp_threshold = alu_sched->threshold_fn; +			if (!tmp_threshold) { +				alu_sched->threshold_fn = _threshold_fn; +			} else { +				while (tmp_threshold->next) { +					tmp_threshold = tmp_threshold->next; +				} +				tmp_threshold->next = _threshold_fn; +			} +			gf_log ("alu", +				GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"",  +				alu_sched->entry_limit.disk_usage,  +				alu_sched->exit_limit.disk_usage); +			 +		} else if (strcmp (order_str, "write-usage") == 0) { +			/* Handle "write-usage" */ +			 +			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); +			ERR_ABORT (_threshold_fn); +			_threshold_fn->diff_value = get_max_diff_write_usage; +			_threshold_fn->sched_value = get_stats_write_usage; +			entry_fn = dict_get (xl->options,  +					     "scheduler.alu.write-usage.entry-threshold"); +			if (entry_fn) { +				if (gf_string2bytesize (entry_fn->data,  +							&alu_sched->entry_limit.write_usage) != 0) { +					gf_log (xl->name, GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of option scheduler.alu." +						"write-usage.entry-threshold",  +						entry_fn->data); +					return -1; +				} +			} else { +				alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT; +			} +			_threshold_fn->entry_value = get_stats_write_usage; +			exit_fn = dict_get (xl->options,  +					    "scheduler.alu.write-usage.exit-threshold"); +			if (exit_fn) { +				if (gf_string2bytesize (exit_fn->data,  +							&alu_sched->exit_limit.write_usage) != 0) { +					gf_log (xl->name, GF_LOG_ERROR,  +						"invalid number format \"%s\"" +						" of \"option scheduler.alu." 
+						"write-usage.exit-threshold\"", +						exit_fn->data); +					return -1; +				} +			} else { +				alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT; +			} +			_threshold_fn->exit_value = get_stats_write_usage; +			tmp_threshold = alu_sched->threshold_fn; +			if (!tmp_threshold) { +				alu_sched->threshold_fn = _threshold_fn; +			} else { +				while (tmp_threshold->next) { +					tmp_threshold = tmp_threshold->next; +				} +				tmp_threshold->next = _threshold_fn; +			} +			gf_log (xl->name, GF_LOG_DEBUG,  +				"alu_init: = %"PRId64",%"PRId64"",  +				alu_sched->entry_limit.write_usage,  +				alu_sched->exit_limit.write_usage); +			 +		} else if (strcmp (order_str, "read-usage") == 0) { +			/* Read usage */ +			 +			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); +			ERR_ABORT (_threshold_fn); +			_threshold_fn->diff_value = get_max_diff_read_usage; +			_threshold_fn->sched_value = get_stats_read_usage; +			entry_fn = dict_get (xl->options,  +					     "scheduler.alu.read-usage.entry-threshold"); +			if (entry_fn) { +				if (gf_string2bytesize (entry_fn->data,  +							&alu_sched->entry_limit.read_usage) != 0) { +					gf_log (xl->name,  +						GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." +						"read-usage.entry-threshold\"", +						entry_fn->data); +					return -1; +				} +			} else { +				alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT; +			} +			_threshold_fn->entry_value = get_stats_read_usage; +			exit_fn = dict_get (xl->options,  +					    "scheduler.alu.read-usage.exit-threshold"); +			if (exit_fn) +			{ +				if (gf_string2bytesize (exit_fn->data,  +							&alu_sched->exit_limit.read_usage) != 0) +				{ +					gf_log ("alu", GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." +						"read-usage.exit-threshold\"",  +						exit_fn->data); +					return -1; +				} +			} +			else +			{ +				alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT; +			} +			_threshold_fn->exit_value = get_stats_read_usage; +			tmp_threshold = alu_sched->threshold_fn; +			if (!tmp_threshold) { +				alu_sched->threshold_fn = _threshold_fn; +			} +			else { +				while (tmp_threshold->next) { +					tmp_threshold = tmp_threshold->next; +				} +				tmp_threshold->next = _threshold_fn; +			} +			gf_log ("alu", GF_LOG_DEBUG,  +				"alu_init: = %"PRId64",%"PRId64"",  +				alu_sched->entry_limit.read_usage,  +				alu_sched->exit_limit.read_usage); +			 +		} else if (strcmp (order_str, "open-files-usage") == 0) { +			/* Open files counter */ +			 +			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); +			ERR_ABORT (_threshold_fn); +			_threshold_fn->diff_value = get_max_diff_file_usage; +			_threshold_fn->sched_value = get_stats_file_usage; +			entry_fn = dict_get (xl->options,  +					     "scheduler.alu.open-files-usage.entry-threshold"); +			if (entry_fn) { +				if (gf_string2uint64 (entry_fn->data,  +						      &alu_sched->entry_limit.nr_files) != 0) +				{ +					gf_log ("alu", GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." 
+						"open-files-usage.entry-" +						"threshold\"", entry_fn->data); +					return -1; +				} +			} +			else +			{ +				alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT; +			} +			_threshold_fn->entry_value = get_stats_file_usage; +			exit_fn = dict_get (xl->options,  +					    "scheduler.alu.open-files-usage.exit-threshold"); +			if (exit_fn) +			{ +				if (gf_string2uint64 (exit_fn->data,  +						      &alu_sched->exit_limit.nr_files) != 0) +				{ +					gf_log ("alu", GF_LOG_ERROR,  +						"invalid number format \"%s\" " +						"of \"option scheduler.alu." +						"open-files-usage.exit-" +						"threshold\"", exit_fn->data); +					return -1; +				} +			} +			else +			{ +				alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT; +			} +			_threshold_fn->exit_value = get_stats_file_usage; +			tmp_threshold = alu_sched->threshold_fn; +			if (!tmp_threshold) { +				alu_sched->threshold_fn = _threshold_fn; +			} +			else { +				while (tmp_threshold->next) { +					tmp_threshold = tmp_threshold->next; +				} +				tmp_threshold->next = _threshold_fn; +			} +			gf_log ("alu", GF_LOG_DEBUG,  +				"alu.c->alu_init: = %"PRIu64",%"PRIu64"",  +				alu_sched->entry_limit.nr_files,  +				alu_sched->exit_limit.nr_files); +			 +		} else if (strcmp (order_str, "disk-speed-usage") == 0) { +			/* Disk speed */ +			 +			_threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); +			ERR_ABORT (_threshold_fn); +			_threshold_fn->diff_value = get_max_diff_disk_speed; +			_threshold_fn->sched_value = get_stats_disk_speed; +			entry_fn = dict_get (xl->options,  +					     "scheduler.alu.disk-speed-usage.entry-threshold"); +			if (entry_fn) { +				gf_log ("alu", GF_LOG_DEBUG, +					"entry-threshold is given, " +					"value is constant"); +			} +			_threshold_fn->entry_value = NULL; +			exit_fn = dict_get (xl->options,  +					    "scheduler.alu.disk-speed-usage.exit-threshold"); +			if (exit_fn) { +				gf_log ("alu", GF_LOG_DEBUG, +					"exit-threshold is given, " +					"value is constant"); +			} +			_threshold_fn->exit_value = NULL; +			tmp_threshold = alu_sched->threshold_fn; +			if (!tmp_threshold) { +				alu_sched->threshold_fn = _threshold_fn; +			} +			else { +				while (tmp_threshold->next) { +					tmp_threshold = tmp_threshold->next; +				} +				tmp_threshold->next = _threshold_fn; +			} +			 +		} else { +			gf_log ("alu", GF_LOG_DEBUG, +				"%s, unknown option provided to scheduler", +				order_str); +		} +		order_str = strtok_r (NULL, ":", &tmp_str); +	} +	 +	return 0; +} + +static int32_t +alu_init (xlator_t *xl) +{ +	struct alu_sched *alu_sched = NULL; +	struct alu_limits *_limit_fn = NULL; +	struct alu_limits *tmp_limits = NULL; +	uint32_t min_free_disk = 0; +	data_t *limits = NULL; +   +	alu_sched = CALLOC (1, sizeof (struct alu_sched)); +	ERR_ABORT (alu_sched); + +	{ +		alu_parse_options (xl, alu_sched); +	} + +	/* Get the limits */ +	 +	limits = dict_get (xl->options,  +			   "scheduler.limits.min-free-disk"); +	if (limits) { +		_limit_fn = CALLOC (1, sizeof (struct alu_limits)); +		ERR_ABORT (_limit_fn); +		_limit_fn->min_value = get_stats_free_disk; +		_limit_fn->cur_value = get_stats_free_disk; +		tmp_limits = alu_sched->limits_fn ; +		_limit_fn->next = tmp_limits; +		alu_sched->limits_fn = _limit_fn; +		 +		if (gf_string2percent (limits->data,  +				       &min_free_disk) != 0) { +			gf_log ("alu", GF_LOG_ERROR,  +				"invalid number format \"%s\" " +				"of \"option scheduler.limits." 
+				"min-free-disk\"", limits->data); +			return -1; +		} +		alu_sched->spec_limit.free_disk = min_free_disk; +		 +		if (alu_sched->spec_limit.free_disk >= 100) { +			gf_log ("alu", GF_LOG_ERROR, +				"check the \"option scheduler." +				"limits.min-free-disk\", it should " +				"be percentage value"); +			return -1; +		} +		alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */ +		gf_log ("alu", GF_LOG_DEBUG, +			"alu.limit.min-disk-free = %"PRId64"",  +			_limit_fn->cur_value (&(alu_sched->spec_limit))); +	} +	 +	limits = dict_get (xl->options,  +			   "scheduler.limits.max-open-files"); +	if (limits) { +		// Update alu_sched->priority properly +		_limit_fn = CALLOC (1, sizeof (struct alu_limits)); +		ERR_ABORT (_limit_fn); +		_limit_fn->max_value = get_stats_file_usage; +		_limit_fn->cur_value = get_stats_file_usage; +		tmp_limits = alu_sched->limits_fn ; +		_limit_fn->next = tmp_limits; +		alu_sched->limits_fn = _limit_fn; +		if (gf_string2uint64_base10 (limits->data,  +					     &alu_sched->spec_limit.nr_files) != 0) +		{ +			gf_log ("alu", GF_LOG_ERROR,  +				"invalid number format '%s' of option " +				"scheduler.limits.max-open-files",  +				limits->data); +			return -1; +		} +		 +		gf_log ("alu", GF_LOG_DEBUG, +			"alu_init: limit.max-open-files = %"PRId64"", +			_limit_fn->cur_value (&(alu_sched->spec_limit))); +	} + + +	/* Stats refresh options */ +	limits = dict_get (xl->options,  +			   "scheduler.refresh-interval"); +	if (limits) { +		if (gf_string2time (limits->data,  +				    &alu_sched->refresh_interval) != 0)	{ +			gf_log ("alu", GF_LOG_ERROR,  +				"invalid number format \"%s\" of " +				"option scheduler.refresh-interval",  +				limits->data); +			return -1; +		} +	} else { +		alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT; +	} +	gettimeofday (&(alu_sched->last_stat_fetch), NULL); +	 +	 +	limits = dict_get (xl->options,  +			   "scheduler.alu.stat-refresh.num-file-create"); +	if (limits) { +		if (gf_string2uint32 (limits->data,  +				      &alu_sched->refresh_create_count) != 0) +		{ +			gf_log ("alu", GF_LOG_ERROR,  +				"invalid number format \"%s\" of \"option " +				"alu.stat-refresh.num-file-create\"",  +				limits->data); +			return -1; +		} +	} else { +		alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT; +	} + +	{ +		/* Build an array of child_nodes */ +		struct alu_sched_struct *sched_array = NULL; +		xlator_list_t *trav_xl = xl->children; +		data_t *data = NULL; +		int32_t index = 0; +		 +		while (trav_xl) { +			index++; +			trav_xl = trav_xl->next; +		} +		alu_sched->child_count = index; +		sched_array = CALLOC (index, sizeof (struct alu_sched_struct)); +		ERR_ABORT (sched_array); +		trav_xl = xl->children; +		index = 0; +		while (trav_xl) { +			sched_array[index].xl = trav_xl->xlator; +			sched_array[index].eligible = 1; +			index++; +			trav_xl = trav_xl->next; +		} +		alu_sched->array = sched_array; + +		data = dict_get (xl->options,  +				 "scheduler.read-only-subvolumes"); +		if (data) { +			char *child = NULL; +			char *tmp = NULL; +			char *childs_data = strdup (data->data); +       +			child = strtok_r (childs_data, ",", &tmp); +			while (child) { +				for (index = 1; index < alu_sched->child_count; index++) { +					if (strcmp (alu_sched->array[index -1].xl->name, child) == 0) { +						memcpy (&(alu_sched->array[index -1]),  +							&(alu_sched->array[alu_sched->child_count -1]),  +							sizeof (struct alu_sched_struct)); +						alu_sched->child_count--; +						break; +					} +				} +			
	child = strtok_r (NULL, ",", &tmp); +			} +		} +	} + +	*((long *)xl->private) = (long)alu_sched; + +	/* Initialize all the alu_sched structure's elements */ +	{ +		alu_sched->sched_nodes_pending = 0; + +		alu_sched->min_limit.free_disk = 0x00FFFFFF; +		alu_sched->min_limit.disk_usage = 0xFFFFFFFF; +		alu_sched->min_limit.total_disk_size = 0xFFFFFFFF; +		alu_sched->min_limit.disk_speed = 0xFFFFFFFF; +		alu_sched->min_limit.write_usage = 0xFFFFFFFF; +		alu_sched->min_limit.read_usage = 0xFFFFFFFF; +		alu_sched->min_limit.nr_files = 0xFFFFFFFF; +		alu_sched->min_limit.nr_clients = 0xFFFFFFFF; +	} + +	pthread_mutex_init (&alu_sched->alu_mutex, NULL); +	return 0; +} + +static void +alu_fini (xlator_t *xl) +{ +	if (!xl) +		return; +	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); +	struct alu_limits *limit = alu_sched->limits_fn; +	struct alu_threshold *threshold = alu_sched->threshold_fn; +	void *tmp = NULL; +	pthread_mutex_destroy (&alu_sched->alu_mutex); +	free (alu_sched->array); +	while (limit) { +		tmp = limit; +		limit = limit->next; +		free (tmp); +	} +	while (threshold) { +		tmp = threshold; +		threshold = threshold->next; +		free (tmp); +	} +	free (alu_sched); +} + +static int32_t  +update_stat_array_cbk (call_frame_t *frame, +		       void *cookie, +		       xlator_t *xl, +		       int32_t op_ret, +		       int32_t op_errno, +		       struct xlator_stats *trav_stats) +{ +	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); +	struct alu_limits *limits_fn = alu_sched->limits_fn; +	int32_t idx = 0; +   +	pthread_mutex_lock (&alu_sched->alu_mutex); +	for (idx = 0; idx < alu_sched->child_count; idx++) { +		if (alu_sched->array[idx].xl == (xlator_t *)cookie) +			break; +	} +	pthread_mutex_unlock (&alu_sched->alu_mutex); + +	if (op_ret == -1) { +		alu_sched->array[idx].eligible = 0; +	} else { +		memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats)); +     +		/* Get stats from all the child node */ +		/* Here check the limits specified by the user to  +		   consider the nodes to be used by scheduler */ +		alu_sched->array[idx].eligible = 1; +		limits_fn = alu_sched->limits_fn; +		while (limits_fn){ +			if (limits_fn->max_value &&  +			    (limits_fn->cur_value (trav_stats) >  +			     limits_fn->max_value (&(alu_sched->spec_limit)))) { +				alu_sched->array[idx].eligible = 0; +			} +			if (limits_fn->min_value &&  +			    (limits_fn->cur_value (trav_stats) <  +			     limits_fn->min_value (&(alu_sched->spec_limit)))) { +				alu_sched->array[idx].eligible = 0; +			} +			limits_fn = limits_fn->next; +		} + +		/* Select minimum and maximum disk_usage */ +		if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) { +			alu_sched->max_limit.disk_usage = trav_stats->disk_usage; +		} +		if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) { +			alu_sched->min_limit.disk_usage = trav_stats->disk_usage; +		} + +		/* Select minimum and maximum disk_speed */ +		if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) { +			alu_sched->max_limit.disk_speed = trav_stats->disk_speed; +		} +		if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) { +			alu_sched->min_limit.disk_speed = trav_stats->disk_speed; +		} + +		/* Select minimum and maximum number of open files */ +		if (trav_stats->nr_files > alu_sched->max_limit.nr_files) { +			alu_sched->max_limit.nr_files = trav_stats->nr_files; +		} +		if (trav_stats->nr_files < alu_sched->min_limit.nr_files) { +			alu_sched->min_limit.nr_files = 
trav_stats->nr_files; +		} + +		/* Select minimum and maximum write-usage */ +		if (trav_stats->write_usage > alu_sched->max_limit.write_usage) { +			alu_sched->max_limit.write_usage = trav_stats->write_usage; +		} +		if (trav_stats->write_usage < alu_sched->min_limit.write_usage) { +			alu_sched->min_limit.write_usage = trav_stats->write_usage; +		} + +		/* Select minimum and maximum read-usage */ +		if (trav_stats->read_usage > alu_sched->max_limit.read_usage) { +			alu_sched->max_limit.read_usage = trav_stats->read_usage; +		} +		if (trav_stats->read_usage < alu_sched->min_limit.read_usage) { +			alu_sched->min_limit.read_usage = trav_stats->read_usage; +		} + +		/* Select minimum and maximum free-disk */ +		if (trav_stats->free_disk > alu_sched->max_limit.free_disk) { +			alu_sched->max_limit.free_disk = trav_stats->free_disk; +		} +		if (trav_stats->free_disk < alu_sched->min_limit.free_disk) { +			alu_sched->min_limit.free_disk = trav_stats->free_disk; +		} +	} + +	STACK_DESTROY (frame->root); + +	return 0; +} + +static void  +update_stat_array (xlator_t *xl) +{ +	/* This function schedules the file in one of the child nodes */ +	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); +	int32_t idx = 0; +	call_frame_t *frame = NULL; +	call_pool_t *pool = xl->ctx->pool; + +	for (idx = 0 ; idx < alu_sched->child_count; idx++) { +		frame = create_frame (xl, pool); + +		STACK_WIND_COOKIE (frame, +				   update_stat_array_cbk,  +				   alu_sched->array[idx].xl, //cookie +				   alu_sched->array[idx].xl,  +				   (alu_sched->array[idx].xl)->mops->stats, +				   0); //flag +	} +	return; +} + +static void  +alu_update (xlator_t *xl) +{ +	struct timeval tv; +	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + +	gettimeofday (&tv, NULL); +	if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) { +		/* Update the stats from all the server */ +		update_stat_array (xl); +		alu_sched->last_stat_fetch.tv_sec = tv.tv_sec; +	} +} + +static xlator_t * +alu_scheduler (xlator_t *xl, const void *path) +{ +	/* This function schedules the file in one of the child nodes */ +	struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); +	int32_t sched_index = 0; +	int32_t sched_index_orig = 0; +	int32_t idx = 0; + +	alu_update (xl); + +	/* Now check each threshold one by one if some nodes are classified */ +	{ +		struct alu_threshold *trav_threshold = alu_sched->threshold_fn; +		struct alu_threshold *tmp_threshold = alu_sched->sched_method; +		struct alu_sched_node *tmp_sched_node;    + +		/* This pointer 'trav_threshold' contains function pointers according to spec file +		   give by user, */ +		while (trav_threshold) { +			/* This check is needed for seeing if already there are nodes in this criteria  +			   to be scheduled */ +			if (!alu_sched->sched_nodes_pending) { +				for (idx = 0; idx < alu_sched->child_count; idx++) { +					if (!alu_sched->array[idx].eligible) { +						continue; +					} +					if (trav_threshold->entry_value) { +						if (trav_threshold->diff_value (&(alu_sched->max_limit), +										&(alu_sched->array[idx].stats)) < +						    trav_threshold->entry_value (&(alu_sched->entry_limit))) { +							continue; +						} +					} +					tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node)); +					ERR_ABORT (tmp_sched_node); +					tmp_sched_node->index = idx; +					if (!alu_sched->sched_node) { +						alu_sched->sched_node = tmp_sched_node; +					} else { +						pthread_mutex_lock (&alu_sched->alu_mutex); 
+						tmp_sched_node->next = alu_sched->sched_node; +						alu_sched->sched_node = tmp_sched_node; +						pthread_mutex_unlock (&alu_sched->alu_mutex); +					} +					alu_sched->sched_nodes_pending++; +				} +			} /* end of if (sched_nodes_pending) */ + +			/* This loop is required to check the eligible nodes */ +			struct alu_sched_node *trav_sched_node; +			while (alu_sched->sched_nodes_pending) { +				trav_sched_node = alu_sched->sched_node; +				sched_index = trav_sched_node->index; +				if (alu_sched->array[sched_index].eligible) +					break; +				alu_sched->sched_node = trav_sched_node->next; +				free (trav_sched_node); +				alu_sched->sched_nodes_pending--; +			} +			if (alu_sched->sched_nodes_pending) { +				/* There are some node in this criteria to be scheduled, no need  +				 * to sort and check other methods  +				 */ +				if (tmp_threshold && tmp_threshold->exit_value) { +					/* verify the exit value && whether node is eligible or not */ +					if (tmp_threshold->diff_value (&(alu_sched->max_limit), +								       &(alu_sched->array[sched_index].stats)) > +					    tmp_threshold->exit_value (&(alu_sched->exit_limit))) { +						/* Free the allocated info for the node :) */ +						pthread_mutex_lock (&alu_sched->alu_mutex); +						alu_sched->sched_node = trav_sched_node->next; +						free (trav_sched_node); +						trav_sched_node = alu_sched->sched_node; +						alu_sched->sched_nodes_pending--; +						pthread_mutex_unlock (&alu_sched->alu_mutex); +					} +				} else { +					/* if there is no exit value, then exit after scheduling once */ +					pthread_mutex_lock (&alu_sched->alu_mutex); +					alu_sched->sched_node = trav_sched_node->next; +					free (trav_sched_node); +					trav_sched_node = alu_sched->sched_node; +					alu_sched->sched_nodes_pending--; +					pthread_mutex_unlock (&alu_sched->alu_mutex); +				} +	 +				alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */ + +				/* */ +				if (trav_sched_node) { +					tmp_sched_node = trav_sched_node; +					while (trav_sched_node->next) { +						trav_sched_node = trav_sched_node->next; +					} +					if (tmp_sched_node->next) { +						pthread_mutex_lock (&alu_sched->alu_mutex); +						alu_sched->sched_node = tmp_sched_node->next; +						tmp_sched_node->next = NULL; +						trav_sched_node->next = tmp_sched_node; +						pthread_mutex_unlock (&alu_sched->alu_mutex); +					} +				} +				/* return the scheduled node */ +				return alu_sched->array[sched_index].xl; +			} /* end of if (pending_nodes) */ +       +			tmp_threshold = trav_threshold; +			trav_threshold = trav_threshold->next; +		} +	} +   +	/* This is used only when there is everything seems ok, or no eligible nodes */ +	sched_index_orig = alu_sched->sched_index; +	alu_sched->sched_method = NULL; +	while (1) { +		//lock +		pthread_mutex_lock (&alu_sched->alu_mutex); +		sched_index = alu_sched->sched_index++; +		alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; +		pthread_mutex_unlock (&alu_sched->alu_mutex); +		//unlock +		if (alu_sched->array[sched_index].eligible) +			break; +		if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) { +			gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule"); +			//lock +			pthread_mutex_lock (&alu_sched->alu_mutex); +			alu_sched->sched_index++; +			alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; +			pthread_mutex_unlock (&alu_sched->alu_mutex); +			//unlock +			break; +		} +	} +	return alu_sched->array[sched_index].xl; +} + +/** + 
* notify + */ +void +alu_notify (xlator_t *xl, int32_t event, void *data) +{ +	struct alu_sched *alu_sched = NULL;  +	int32_t idx = 0; +   +	alu_sched = (struct alu_sched *)*((long *)xl->private); +	if (!alu_sched) +		return; + +	for (idx = 0; idx < alu_sched->child_count; idx++) { +		if (alu_sched->array[idx].xl == (xlator_t *)data) +			break; +	} + +	switch (event) +	{ +	case GF_EVENT_CHILD_UP: +	{ +		//alu_sched->array[idx].eligible = 1; +	} +	break; +	case GF_EVENT_CHILD_DOWN: +	{ +		alu_sched->array[idx].eligible = 0; +	} +	break; +	default: +	{ +		; +	} +	break; +	} + +} + +struct sched_ops sched = { +	.init     = alu_init, +	.fini     = alu_fini, +	.update   = alu_update, +	.schedule = alu_scheduler, +	.notify   = alu_notify +}; + +struct volume_options options[] = { +	{ .key   = { "scheduler.alu.order", "alu.order" },   +	  .type  = GF_OPTION_TYPE_ANY  +	}, +	{ .key   = { "scheduler.alu.disk-usage.entry-threshold",  +		     "alu.disk-usage.entry-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET +	}, +	{ .key   = { "scheduler.alu.disk-usage.exit-threshold",  +		     "alu.disk-usage.exit-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET +	}, +	{ .key   = { "scheduler.alu.write-usage.entry-threshold",  +		     "alu.write-usage.entry-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET +	}, +	{ .key   = { "scheduler.alu.write-usage.exit-threshold",  +		     "alu.write-usage.exit-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET  +	}, +	{ .key   = { "scheduler.alu.read-usage.entry-threshold",  +		     "alu.read-usage.entry-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET +	}, +	{ .key   = { "scheduler.alu.read-usage.exit-threshold",  +		     "alu.read-usage.exit-threshold" },   +	  .type  = GF_OPTION_TYPE_SIZET  +	}, +	{ .key   = { "scheduler.alu.open-files-usage.entry-threshold",  +		     "alu.open-files-usage.entry-threshold" },   +	  .type  = GF_OPTION_TYPE_INT +	}, +	{ .key   = { "scheduler.alu.open-files-usage.exit-threshold",  +		     "alu.open-files-usage.exit-threshold" },   +	  .type  = GF_OPTION_TYPE_INT  +	}, +	{ .key   = { "scheduler.read-only-subvolumes", +		     "alu.read-only-subvolumes" },   +	  .type  = GF_OPTION_TYPE_ANY  +	}, +	{ .key   = { "scheduler.refresh-interval",  +		     "alu.refresh-interval", +		     "alu.stat-refresh.interval" }, +	  .type  = GF_OPTION_TYPE_TIME +	}, +	{ .key   = { "scheduler.limits.min-free-disk",  +		     "alu.limits.min-free-disk" },   +	  .type  = GF_OPTION_TYPE_PERCENT +	}, +	{ .key   = { "scheduler.alu.stat-refresh.num-file-create" +		     "alu.stat-refresh.num-file-create"},   +	  .type  = GF_OPTION_TYPE_INT +	}, +	{ .key  =  {NULL}, } +}; diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h new file mode 100644 index 00000000000..a958bb4d298 --- /dev/null +++ b/scheduler/alu/src/alu.h @@ -0,0 +1,89 @@ +/* +   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  
If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef _ALU_H +#define _ALU_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" + +struct alu_sched; + +struct alu_sched_struct { +  xlator_t *xl; +  struct xlator_stats stats; +  unsigned char eligible; +}; + +// Write better name for these functions +struct alu_limits { +  struct alu_limits *next; +  int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */ +  int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */ +  int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */ +}; + +struct alu_threshold { +  struct alu_threshold *next; +  int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */ +  int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */ +  int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */ +  int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */ +}; + +struct alu_sched_node { +  struct alu_sched_node *next; +  int32_t index; +}; + +struct alu_sched { +  struct alu_limits *limits_fn; +  struct alu_threshold *threshold_fn; +  struct alu_sched_struct *array; +  struct alu_sched_node *sched_node; +  struct alu_threshold *sched_method; +  struct xlator_stats max_limit; +  struct xlator_stats min_limit; +  struct xlator_stats entry_limit; +  struct xlator_stats exit_limit; +  struct xlator_stats spec_limit;     /* User given limit */ + +  pthread_mutex_t alu_mutex; +  struct timeval last_stat_fetch; +  uint32_t refresh_interval;      /* in seconds */ +  uint32_t refresh_create_count;  /* num-file-create */ + +  int32_t sched_nodes_pending; + +  int32_t sched_index;  /* used for round robin scheduling in case of many nodes getting the criteria match. */ +  int32_t child_count; + +}; + +struct _alu_local_t { +  int32_t call_count; +}; + +typedef struct _alu_local_t alu_local_t; + +#endif /* _ALU_H */ diff --git a/scheduler/nufa/Makefile.am b/scheduler/nufa/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/nufa/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/scheduler/nufa/src/Makefile.am b/scheduler/nufa/src/Makefile.am new file mode 100644 index 00000000000..6eb3d39f1e2 --- /dev/null +++ b/scheduler/nufa/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = nufa.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +nufa_la_LDFLAGS = -module -avoidversion + +nufa_la_SOURCES = nufa.c +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/scheduler/nufa/src/nufa.c b/scheduler/nufa/src/nufa.c new file mode 100644 index 00000000000..bbc61e2ad6f --- /dev/null +++ b/scheduler/nufa/src/nufa.c @@ -0,0 +1,403 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. 
+ +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> + +#include "scheduler.h" +#include "common-utils.h" + +struct nufa_sched_struct { +	xlator_t *xl; +	struct timeval last_stat_fetch; +	int64_t free_disk; +	int32_t refresh_interval; +	unsigned char eligible; +}; + +struct nufa_struct { +	struct nufa_sched_struct *array; +	struct timeval last_stat_fetch; + +	int32_t *local_array; /* Used to keep the index of the local xlators */ +	int32_t local_xl_index; /* index in the above array */ +	int32_t local_xl_count; /* Count of the local subvolumes */ + +	uint32_t refresh_interval; +	uint32_t min_free_disk; +   +	gf_lock_t nufa_lock; +	int32_t child_count; +	int32_t sched_index;   +}; + +#define NUFA_LIMITS_MIN_FREE_DISK_DEFAULT    15 +#define NUFA_REFRESH_INTERVAL_DEFAULT        30 + +static int32_t +nufa_init (xlator_t *xl) +{ +	int32_t index = 0; +	data_t *local_name = NULL; +	data_t *data = NULL; +	xlator_list_t *trav_xl = xl->children; +	struct nufa_struct *nufa_buf = NULL; + +	nufa_buf = CALLOC (1, sizeof (struct nufa_struct)); +	ERR_ABORT (nufa_buf); + +	data = dict_get (xl->options, "scheduler.limits.min-free-disk"); +	if (data) { +		if (gf_string2percent (data->data,  +				       &nufa_buf->min_free_disk) != 0) { +			gf_log ("nufa", GF_LOG_ERROR,  +				"invalid number format \"%s\" of " +				"\"option scheduler.limits.min-free-disk\"",  +				data->data); +			return -1; +		} +		if (nufa_buf->min_free_disk >= 100) { +			gf_log ("nufa", GF_LOG_ERROR, +				"check \"option scheduler.limits.min-free-disk" +				"\", it should be percentage value"); +			return -1; +		} +	} else { +		gf_log ("nufa", GF_LOG_WARNING,  +			"No option for limit min-free-disk given, " +			"defaulting it to 15%%"); +		nufa_buf->min_free_disk = NUFA_LIMITS_MIN_FREE_DISK_DEFAULT; +	} +	data = dict_get (xl->options, "scheduler.refresh-interval"); +	if (data && (gf_string2time (data->data,  +				    &nufa_buf->refresh_interval) != 0)) { +		gf_log ("nufa", GF_LOG_ERROR,  +			"invalid number format \"%s\" of " +			"\"option scheduler.refresh-interval\"",  +			data->data); +		return -1; +	} else { +		gf_log ("nufa", GF_LOG_WARNING,  +			"No option for scheduler.refresh-interval given, " +			"defaulting it to 30"); +		nufa_buf->refresh_interval = NUFA_REFRESH_INTERVAL_DEFAULT; +	} +   +	/* Get the array built */ +	while (trav_xl) { +		index++; +		trav_xl = trav_xl->next; +	} +	nufa_buf->child_count = index; +	nufa_buf->sched_index = 0; +	nufa_buf->array = CALLOC (index, sizeof (struct nufa_sched_struct)); +	ERR_ABORT (nufa_buf->array); +	nufa_buf->local_array = CALLOC (index, sizeof (int32_t)); +	ERR_ABORT (nufa_buf->array); +	trav_xl = xl->children; +	 +	local_name = dict_get (xl->options, "scheduler.local-volume-name"); +	if (!local_name) { +		/* Error */ +		gf_log ("nufa", GF_LOG_ERROR,  +			"No 'local-volume-name' option given in volume file"); +		FREE (nufa_buf->array); +		FREE (nufa_buf->local_array); +		FREE (nufa_buf); +		return -1; +	} + +	/* Get the array properly */ +	index = 0; +	trav_xl = xl->children; +	while (trav_xl) { +		nufa_buf->array[index].xl = trav_xl->xlator; +		
nufa_buf->array[index].eligible = 1; +		nufa_buf->array[index].free_disk = nufa_buf->min_free_disk; +		nufa_buf->array[index].refresh_interval =  +			nufa_buf->refresh_interval; + +		trav_xl = trav_xl->next; +		index++; +	} +   +	{  +		int32_t array_index = 0; +		char *child = NULL; +		char *tmp = NULL; +		char *childs_data = strdup (local_name->data); +		 +		child = strtok_r (childs_data, ",", &tmp); +		while (child) { +			/* Check if the local_volume specified is proper  +			   subvolume of unify */ +			trav_xl = xl->children; +			index=0; +			while (trav_xl) { +				if (strcmp (child, trav_xl->xlator->name) == 0) +					break; +				trav_xl = trav_xl->next; +				index++; +			} + +			if (!trav_xl) { +				/* entry for 'local-volume-name' is wrong,  +				   not present in subvolumes */ +				gf_log ("nufa", GF_LOG_ERROR,  +					"option 'scheduler.local-volume-name' " +					"%s is wrong", child); +				FREE (nufa_buf->array); +				FREE (nufa_buf->local_array); +				FREE (nufa_buf); +				return -1; +			} else { +				nufa_buf->local_array[array_index++] = index; +				nufa_buf->local_xl_count++; +			} +			child = strtok_r (NULL, ",", &tmp); +		} +		free (childs_data); +	} + +	LOCK_INIT (&nufa_buf->nufa_lock); +	*((long *)xl->private) = (long)nufa_buf; // put it at the proper place +	return 0; +} + +static void +nufa_fini (xlator_t *xl) +{ +	struct nufa_struct *nufa_buf =  +		(struct nufa_struct *)*((long *)xl->private); + +	LOCK_DESTROY (&nufa_buf->nufa_lock); +	FREE (nufa_buf->local_array); +	FREE (nufa_buf->array); +	FREE (nufa_buf); +} + +static int32_t  +update_stat_array_cbk (call_frame_t *frame, +		       void *cookie, +		       xlator_t *xl, +		       int32_t op_ret, +		       int32_t op_errno, +		       struct xlator_stats *trav_stats) +{ +	struct nufa_struct *nufa_struct = NULL;  +	int32_t idx = 0; +	int32_t percent = 0; + +	nufa_struct = (struct nufa_struct *)*((long *)xl->private); +	LOCK (&nufa_struct->nufa_lock); +	for (idx = 0; idx < nufa_struct->child_count; idx++) { +		if (nufa_struct->array[idx].xl->name == (char *)cookie) +			break; +	} +	UNLOCK (&nufa_struct->nufa_lock); + +	if (op_ret == 0) { +		percent = ((trav_stats->free_disk * 100) /  +			   trav_stats->total_disk_size); +		if (nufa_struct->array[idx].free_disk > percent) { +			if (nufa_struct->array[idx].eligible) +				gf_log ("nufa", GF_LOG_CRITICAL, +					"node \"%s\" is _almost_ (%d %%) full", +					nufa_struct->array[idx].xl->name,  +					100 - percent); +			nufa_struct->array[idx].eligible = 0; +		} else { +			nufa_struct->array[idx].eligible = 1; +		} +	} else { +		nufa_struct->array[idx].eligible = 0; +	} + +	STACK_DESTROY (frame->root); +	return 0; +} + +static void  +update_stat_array (xlator_t *xl) +{ +	/* This function schedules the file in one of the child nodes */ +	int32_t idx; +	struct nufa_struct *nufa_buf =  +		(struct nufa_struct *)*((long *)xl->private); +	call_frame_t *frame = NULL; +	call_pool_t *pool = xl->ctx->pool; + +	for (idx = 0; idx < nufa_buf->child_count; idx++) { +		frame = create_frame (xl, pool); + +		STACK_WIND_COOKIE (frame, +				   update_stat_array_cbk,  +				   nufa_buf->array[idx].xl->name, +				   nufa_buf->array[idx].xl,  +				   (nufa_buf->array[idx].xl)->mops->stats, +				   0); //flag +	} + +	return; +} + +static void  +nufa_update (xlator_t *xl) +{ +	struct nufa_struct *nufa_buf =  +		(struct nufa_struct *)*((long *)xl->private); +	struct timeval tv; +	gettimeofday (&tv, NULL); +	if (tv.tv_sec > (nufa_buf->refresh_interval  +			 + nufa_buf->last_stat_fetch.tv_sec)) { +		/* Update 
the stats from all the server */ +		update_stat_array (xl); +		nufa_buf->last_stat_fetch.tv_sec = tv.tv_sec; +	} +} + +static xlator_t * +nufa_schedule (xlator_t *xl, const void *path) +{ +	struct nufa_struct *nufa_buf =  +		(struct nufa_struct *)*((long *)xl->private); +	int32_t nufa_orig = nufa_buf->local_xl_index;   +	int32_t rr; +   +	nufa_update (xl); +   +	while (1) { +		LOCK (&nufa_buf->nufa_lock); +		rr = nufa_buf->local_xl_index++; +		nufa_buf->local_xl_index %= nufa_buf->local_xl_count; +		UNLOCK (&nufa_buf->nufa_lock); +     +		/* if 'eligible' or there are _no_ eligible nodes */ +		if (nufa_buf->array[nufa_buf->local_array[rr]].eligible) { +			/* Return the local node */ +			return nufa_buf->array[nufa_buf->local_array[rr]].xl; +		} +		if ((rr + 1) % nufa_buf->local_xl_count == nufa_orig) { +			gf_log ("nufa", GF_LOG_CRITICAL,  +				"No free space available on any local " +				"volumes, using RR scheduler"); +			LOCK (&nufa_buf->nufa_lock); +			nufa_buf->local_xl_index++; +			nufa_buf->local_xl_index %= nufa_buf->local_xl_count; +			UNLOCK (&nufa_buf->nufa_lock); +			break; +		} +	} + +	nufa_orig = nufa_buf->sched_index;   +	while (1) { +		LOCK (&nufa_buf->nufa_lock); +		rr = nufa_buf->sched_index++; +		nufa_buf->sched_index = (nufa_buf->sched_index %  +					 nufa_buf->child_count); +		UNLOCK (&nufa_buf->nufa_lock); +     +		/* if 'eligible' or there are _no_ eligible nodes */ +		if (nufa_buf->array[rr].eligible) { +			break; +		} +		if ((rr + 1) % nufa_buf->child_count == nufa_orig) { +			gf_log ("nufa", GF_LOG_CRITICAL,  +				"No free space available on any server, " +				"using RR scheduler."); +			LOCK (&nufa_buf->nufa_lock); +			nufa_buf->sched_index++; +			nufa_buf->sched_index = (nufa_buf->sched_index %  +						 nufa_buf->child_count); +			UNLOCK (&nufa_buf->nufa_lock); +			break; +		} +	} +	return nufa_buf->array[rr].xl; +} + + +/** + * notify + */ +void +nufa_notify (xlator_t *xl, int32_t event, void *data) +{ +	struct nufa_struct *nufa_buf =  +		(struct nufa_struct *)*((long *)xl->private); +	int32_t idx = 0; +   +	if (!nufa_buf) +		return; + +	for (idx = 0; idx < nufa_buf->child_count; idx++) { +		if (strcmp (nufa_buf->array[idx].xl->name,  +			    ((xlator_t *)data)->name) == 0) +			break; +	} + +	switch (event) +	{ +	case GF_EVENT_CHILD_UP: +	{ +		//nufa_buf->array[idx].eligible = 1; +	} +	break; +	case GF_EVENT_CHILD_DOWN: +	{ +		nufa_buf->array[idx].eligible = 0; +	} +	break; +	default: +	{ +		; +	} +	break; +	} + +} + +struct sched_ops sched = { +	.init     = nufa_init, +	.fini     = nufa_fini, +	.update   = nufa_update, +	.schedule = nufa_schedule, +	.notify   = nufa_notify +}; + +struct volume_options options[] = { +	{ .key   = { "scheduler.refresh-interval",  +		     "nufa.refresh-interval" },   +	  .type  = GF_OPTION_TYPE_TIME +	}, +	{ .key   = { "scheduler.limits.min-free-disk",  +		     "nufa.limits.min-free-disk" },   +	  .type  = GF_OPTION_TYPE_PERCENT +	}, +	{ .key   = { "scheduler.local-volume-name", +		     "nufa.local-volume-name" }, +	  .type  = GF_OPTION_TYPE_XLATOR +	},	 +	{ .key = {NULL} } +}; + diff --git a/scheduler/random/Makefile.am b/scheduler/random/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/random/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/scheduler/random/src/Makefile.am b/scheduler/random/src/Makefile.am new file mode 100644 index 00000000000..572181336c2 --- /dev/null +++ b/scheduler/random/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = random.la 
+scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +random_la_LDFLAGS = -module -avoidversion + +random_la_SOURCES = random.c +random_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = random.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/scheduler/random/src/random.c b/scheduler/random/src/random.c new file mode 100644 index 00000000000..9e761d08a54 --- /dev/null +++ b/scheduler/random/src/random.c @@ -0,0 +1,283 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#include <stdlib.h> +#include <sys/time.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "random.h" + +#define RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT    15 +#define RANDOM_REFRESH_INTERVAL_DEFAULT        10 + + +static int32_t +random_init (xlator_t *xl) +{ +	struct random_struct *random_buf = NULL; +	xlator_list_t *trav_xl = xl->children; +	data_t *limit = NULL; +	int32_t index = 0; + +	random_buf = CALLOC (1, sizeof (struct random_struct)); +	ERR_ABORT (random_buf); +   +	/* Set the seed for the 'random' function */ +	srandom ((uint32_t) time (NULL)); +   +	limit = dict_get (xl->options, "scheduler.limits.min-free-disk"); +	if (limit) { +		if (gf_string2percent (data_to_str (limit), +				       &random_buf->min_free_disk) != 0) { +			gf_log ("random", GF_LOG_ERROR,  +				"invalid number format \"%s\" of \"option " +				"scheduler.limits.min-free-disk\"",  +				limit->data); +			return -1; +		} +		if (random_buf->min_free_disk >= 100) { +			gf_log ("random", GF_LOG_ERROR, +				"check the \"option random.limits.min-free" +				"-disk\", it should be percentage value"); +			return -1; +		} +       +	} else { +		gf_log ("random", GF_LOG_WARNING,  +			"No option for limit min-free-disk given, " +			"defaulting it to 5%%"); +		random_buf->min_free_disk =  +			RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT; +	} +   +	limit = dict_get (xl->options, "scheduler.refresh-interval"); +	if (limit) { +		if (gf_string2time (data_to_str (limit), +				    &random_buf->refresh_interval) != 0) { +			gf_log ("random", GF_LOG_ERROR,  +				"invalid number format \"%s\" of " +				"\"option random.refresh-interval\"",  +				limit->data); +			return -1; +		} +	} else { +		random_buf->refresh_interval = RANDOM_REFRESH_INTERVAL_DEFAULT; +	} +   +	while (trav_xl) { +		index++; +		trav_xl = trav_xl->next; +	} +	random_buf->child_count = index; +	random_buf->array = CALLOC (index,  +				    sizeof (struct random_sched_struct)); +	ERR_ABORT (random_buf->array); +	trav_xl = xl->children; +	index = 0; + +	while (trav_xl) { +		random_buf->array[index].xl = trav_xl->xlator; +		random_buf->array[index].eligible = 1; +		trav_xl = trav_xl->next; +		index++; +	} +	
pthread_mutex_init (&random_buf->random_mutex, NULL); + +	// put it at the proper place   +	*((long *)xl->private) = (long)random_buf;  +	return 0; +} + +static void +random_fini (xlator_t *xl) +{ +	struct random_struct *random_buf = NULL; + +	random_buf = (struct random_struct *)*((long *)xl->private); +	pthread_mutex_destroy (&random_buf->random_mutex); +	free (random_buf->array); +	free (random_buf); +} + + +static int32_t  +update_stat_array_cbk (call_frame_t *frame, +		       void *cookie, +		       xlator_t *xl, +		       int32_t op_ret, +		       int32_t op_errno, +		       struct xlator_stats *trav_stats) +{ +	int32_t idx = 0; +	int32_t percent = 0; +	struct random_struct *random_buf = NULL; +	 +	random_buf = (struct random_struct *)*((long *)xl->private); + +	pthread_mutex_lock (&random_buf->random_mutex); +	for (idx = 0; idx < random_buf->child_count; idx++) { +		if (strcmp (random_buf->array[idx].xl->name,  +			    (char *)cookie) == 0) +			break; +	} +	pthread_mutex_unlock (&random_buf->random_mutex); + +	if (op_ret == 0) { +		percent = ((trav_stats->free_disk *100) /  +			   trav_stats->total_disk_size); +		if (random_buf->min_free_disk > percent) { +			random_buf->array[idx].eligible = 0; +		} else { +			random_buf->array[idx].eligible = 1; +		} +	} else { +		random_buf->array[idx].eligible = 0; +	}     + +	STACK_DESTROY (frame->root); +	return 0; +} + +static void  +update_stat_array (xlator_t *xl) +{ +	int32_t idx; +	struct random_struct *random_buf = NULL; +	call_frame_t *frame = NULL; +	call_pool_t *pool = xl->ctx->pool; + +	random_buf = (struct random_struct *)*((long *)xl->private); +	for (idx = 0; idx < random_buf->child_count; idx++) { +		frame = create_frame (xl, pool); + +		STACK_WIND_COOKIE (frame, +				   update_stat_array_cbk, +				   random_buf->array[idx].xl->name, +				   random_buf->array[idx].xl, +				   (random_buf->array[idx].xl)->mops->stats, +				   0); +	} +	return ; +} + +static void  +random_update (xlator_t *xl) +{ +	struct timeval tv; +	struct random_struct *random_buf = NULL; + +	random_buf = (struct random_struct *)*((long *)xl->private); + +	gettimeofday(&tv, NULL); +	if (tv.tv_sec > (random_buf->refresh_interval +  +			 random_buf->last_stat_entry.tv_sec)) { +		update_stat_array (xl); +		random_buf->last_stat_entry.tv_sec = tv.tv_sec; +	} +} + +static xlator_t * +random_schedule (xlator_t *xl, const void *path) +{ +	struct random_struct *random_buf = NULL;        +	int32_t rand = 0; +	int32_t try = 0; +	 +	random_buf = (struct random_struct *)*((long *)xl->private); + +	rand = (int32_t) (1.0*random_buf->child_count *  +			  (random() / (RAND_MAX + 1.0))); + +	random_update (xl); + +	while (!random_buf->array[rand].eligible) { +		if (try++ > 100) { +			/* there is a chance of this becoming a  +			   infinite loop otherwise. 
*/ +			break;  +		} +		rand = (int32_t) (1.0*random_buf->child_count *  +				  (random() / (RAND_MAX + 1.0))); +	} +	return random_buf->array[rand].xl; +} + + +/** + * notify + */ +void +random_notify (xlator_t *xl, int32_t event, void *data) +{ +	struct random_struct *random_buf = NULL; +	int32_t idx = 0; +   +	random_buf = (struct random_struct *)*((long *)xl->private); +	if (!random_buf) +		return; + +	for (idx = 0; idx < random_buf->child_count; idx++) { +		if (random_buf->array[idx].xl == (xlator_t *)data) +			break; +	} + +	switch (event) +	{ +	case GF_EVENT_CHILD_UP: +	{ +		//random_buf->array[idx].eligible = 1; +	} +	break; +	case GF_EVENT_CHILD_DOWN: +	{ +		random_buf->array[idx].eligible = 0; +	} +	break; +	default: +	{ +		; +	} +	break; +	} + +} + +struct sched_ops sched = { +	.init     = random_init, +	.fini     = random_fini, +	.update   = random_update, +	.schedule = random_schedule, +	.notify   = random_notify +}; + +struct volume_options options[] = { +	{ .key   = { "scheduler.refresh-interval",  +		     "random.refresh-interval" },   +	  .type  = GF_OPTION_TYPE_TIME +	}, +	{ .key   = { "scheduler.limits.min-free-disk",  +		     "random.limits.min-free-disk" },   +	  .type  = GF_OPTION_TYPE_PERCENT +	}, +	{ .key = {NULL} } +}; diff --git a/scheduler/random/src/random.h b/scheduler/random/src/random.h new file mode 100644 index 00000000000..35c9e02ee2b --- /dev/null +++ b/scheduler/random/src/random.h @@ -0,0 +1,46 @@ +/* +   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _RANDOM_H +#define _RANDOM_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include <sys/time.h> +#include "scheduler.h" + +struct random_sched_struct { +  xlator_t *xl; +  unsigned char eligible; +}; + +struct random_struct { +  int32_t child_count; +  uint32_t refresh_interval; +  uint32_t min_free_disk; +  struct timeval last_stat_entry; +  struct random_sched_struct *array; +  pthread_mutex_t random_mutex; +}; + +#endif /* _RANDOM_H */ diff --git a/scheduler/rr/Makefile.am b/scheduler/rr/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/rr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/scheduler/rr/src/Makefile.am b/scheduler/rr/src/Makefile.am new file mode 100644 index 00000000000..7e911c0eda8 --- /dev/null +++ b/scheduler/rr/src/Makefile.am @@ -0,0 +1,13 @@ +sched_LTLIBRARIES = rr.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +rr_la_LDFLAGS = -module -avoidversion + +rr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +rr_la_SOURCES = rr.c rr-options.c +noinst_HEADERS = rr.h rr-options.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/scheduler/rr/src/rr-options.c b/scheduler/rr/src/rr-options.c new file mode 100644 index 00000000000..3f0ffcaf2e9 --- /dev/null +++ b/scheduler/rr/src/rr-options.c @@ -0,0 +1,256 @@ +/* +  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#include "scheduler.h" +#include "rr-options.h" + +#define RR_LIMITS_MIN_FREE_DISK_OPTION_STRING  "scheduler.limits.min-free-disk" +#define RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT  15 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MIN      0 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MAX      100 + +#define RR_REFRESH_INTERVAL_OPTION_STRING      "scheduler.refresh-interval" +#define RR_REFRESH_INTERVAL_VALUE_DEFAULT      10 + +#define RR_READ_ONLY_SUBVOLUMES_OPTION_STRING  "scheduler.read-only-subvolumes" + +#define LOG_ERROR(args...)      gf_log ("rr-options", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...)    gf_log ("rr-options", GF_LOG_WARNING, ##args) + +static int  +_rr_options_min_free_disk_validate (const char *value_string, uint32_t *n) +{ +	uint32_t value = 0; +   +	if (value_string == NULL) +	{ +		return -1; +	} +   +	if (gf_string2percent (value_string, &value) != 0) +	{ +		gf_log ("rr",  +			GF_LOG_ERROR,  +			"invalid number format [%s] of option [%s]",  +			value_string,  +			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); +		return -1; +	} +   +	if ((value <= RR_LIMITS_MIN_FREE_DISK_VALUE_MIN) ||  +	    (value >= RR_LIMITS_MIN_FREE_DISK_VALUE_MAX)) +	{ +		gf_log ("rr",  +			GF_LOG_ERROR,  +			"out of range [%d] of option [%s].  
Allowed range is 0 to 100.",  +			value,  +			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); +		return -1; +	} +   +	*n = value; +   +	return 0; +} + +static int  +_rr_options_refresh_interval_validate (const char *value_string, uint32_t *n) +{ +	uint32_t value = 0; +   +	if (value_string == NULL) +	{ +		return -1; +	} +   +	if (gf_string2time (value_string, &value) != 0) +	{ +		gf_log ("rr",  +			GF_LOG_ERROR,  +			"invalid number format [%s] of option [%s]",  +			value_string,  +			RR_REFRESH_INTERVAL_OPTION_STRING); +		return -1; +	} +   +	*n = value; +   +	return 0; +} + +static int  +_rr_options_read_only_subvolumes_validate (const char *value_string,  +					   char ***volume_list,  +					   uint64_t *volume_count) +{ +	char **vlist = NULL; +	int vcount = 0; +	int i = 0; +   +	if (value_string == NULL || volume_list == NULL || volume_count) +	{ +		return -1; +	} +   +	if (gf_strsplit (value_string,  +			 ", ",  +			 &vlist,  +			 &vcount) != 0) +	{ +		gf_log ("rr",  +			GF_LOG_ERROR,  +			"invalid subvolume list [%s] of option [%s]",  +			value_string,  +			RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); +		return -1; +	} +   +	for (i = 0; i < vcount; i++) +	{ +		if (gf_volume_name_validate (vlist[i]) != 0) +		{ +			gf_log ("rr",  +				GF_LOG_ERROR,  +				"invalid subvolume name [%s] in [%s] of option [%s]",  +				vlist[i],  +				value_string,  +				RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); +			goto free_exit; +		} +	} +   +	*volume_list = vlist; +	*volume_count = vcount; +   +	return 0; +   + free_exit: +	for (i = 0; i < vcount; i++) +	{ +		free (vlist[i]); +	} +	free (vlist); +   +	return -1; +} + +int  +rr_options_validate (dict_t *options, rr_options_t *rr_options) +{ +	char *value_string = NULL; +   +	if (options == NULL || rr_options == NULL) +	{ +		return -1; +	} +   +	if (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)) +		if (data_to_str (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING))) +			value_string = data_to_str (dict_get (options,  +							      RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)); +	if (value_string != NULL) +	{ +		if (_rr_options_min_free_disk_validate (value_string,  +							&rr_options->min_free_disk) != 0) +		{ +			return -1; +		} +       +		gf_log ("rr",  +			GF_LOG_WARNING,  +			"using %s = %d",  +			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING,  +			rr_options->min_free_disk); +	} +	else  +	{ +		rr_options->min_free_disk = RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT; +       +		gf_log ("rr", GF_LOG_DEBUG,  +			"using %s = %d [default]",  +			RR_LIMITS_MIN_FREE_DISK_OPTION_STRING,  +			rr_options->min_free_disk); +	} +   +	value_string = NULL; +	if (dict_get (options, RR_REFRESH_INTERVAL_OPTION_STRING)) +		value_string = data_to_str (dict_get (options,  +						      RR_REFRESH_INTERVAL_OPTION_STRING)); +	if (value_string != NULL) +	{ +		if (_rr_options_refresh_interval_validate (value_string,  +							   &rr_options->refresh_interval) != 0) +		{ +			return -1; +		} +       +		gf_log ("rr",  +			GF_LOG_WARNING,  +			"using %s = %d",  +			RR_REFRESH_INTERVAL_OPTION_STRING,  +			rr_options->refresh_interval); +	} +	else  +	{ +		rr_options->refresh_interval = RR_REFRESH_INTERVAL_VALUE_DEFAULT; +       +		gf_log ("rr", GF_LOG_DEBUG,  +			"using %s = %d [default]",  +			RR_REFRESH_INTERVAL_OPTION_STRING,  +			rr_options->refresh_interval); +	} +   +	value_string = NULL; +	if (dict_get (options, RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)) +		value_string = data_to_str (dict_get (options,  +						      RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)); +	if (value_string != NULL) +	{ +		if 
(_rr_options_read_only_subvolumes_validate (value_string,  +							       &rr_options->read_only_subvolume_list,  +							       &rr_options->read_only_subvolume_count) != 0) +		{ +			return -1; +		} +       +		gf_log ("rr",  +			GF_LOG_WARNING,  +			"using %s = [%s]",  +			RR_READ_ONLY_SUBVOLUMES_OPTION_STRING,  +			value_string); +	} +   +	return 0; +} + +struct volume_options options[] = { +	{ .key   = { "scheduler.refresh-interval",  +		     "rr.refresh-interval" },   +	  .type  = GF_OPTION_TYPE_TIME +	}, +	{ .key   = { "scheduler.limits.min-free-disk",  +		     "rr.limits.min-free-disk" },   +	  .type  = GF_OPTION_TYPE_PERCENT +	}, +	{ .key   = { "scheduler.read-only-subvolumes",  +		     "rr.read-only-subvolumes" },   +	  .type  = GF_OPTION_TYPE_ANY +	}, +	{ .key = {NULL} } +}; diff --git a/scheduler/rr/src/rr-options.h b/scheduler/rr/src/rr-options.h new file mode 100644 index 00000000000..4818c7d491c --- /dev/null +++ b/scheduler/rr/src/rr-options.h @@ -0,0 +1,34 @@ +/* +   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef _RR_OPTIONS_H +#define _RR_OPTIONS_H + +struct rr_options +{ +  uint32_t min_free_disk; +  uint32_t refresh_interval; +  char     **read_only_subvolume_list; +  uint64_t read_only_subvolume_count; +}; +typedef struct rr_options rr_options_t; + +int rr_options_validate (dict_t *options, rr_options_t *rr_options); + +#endif diff --git a/scheduler/rr/src/rr.c b/scheduler/rr/src/rr.c new file mode 100644 index 00000000000..3e54ff5e1b6 --- /dev/null +++ b/scheduler/rr/src/rr.c @@ -0,0 +1,565 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> +#include <stdlib.h> + +#include <stdint.h> + +#include "scheduler.h" + +#include "rr-options.h" +#include "rr.h" + +#define RR_MIN_FREE_DISK_NOT_REACHED    0 +#define RR_MIN_FREE_DISK_REACHED        1 + +#define RR_SUBVOLUME_OFFLINE    0 +#define RR_SUBVOLUME_ONLINE     1 + +#define LOG_ERROR(args...)      gf_log ("rr", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...)    
gf_log ("rr", GF_LOG_WARNING, ##args) +#define LOG_CRITICAL(args...)    gf_log ("rr", GF_LOG_CRITICAL, ##args) + +#define ROUND_ROBIN(index, count)    ((index + 1) % count) + +static int  +_cleanup_rr (rr_t *rr) +{ +	int i; +   +	if (rr == NULL) +	{ +		return -1; +	} +   +	if (rr->options.read_only_subvolume_list != NULL) +	{ +		for (i = 0; i < rr->options.read_only_subvolume_count; i++) +		{ +			free (rr->options.read_only_subvolume_list[i]); +		} +		free (rr->options.read_only_subvolume_list); +	} +   +	free (rr->subvolume_list); +   +	free (rr); +   +	return 0; +} + +int  +rr_init (xlator_t *this_xl) +{ +	rr_t *rr = NULL; +	dict_t *options = NULL; +	xlator_list_t *children = NULL; +	uint64_t children_count = 0; +	int i = 0; +	int j = 0; +   +	if (this_xl == NULL) +	{ +		return -1; +	} +   +	if ((options = this_xl->options) == NULL) +	{ +		return -1; +	} +   +	if ((children = this_xl->children) == NULL) +	{ +		return -1; +	} +   +	if ((rr = CALLOC (1, sizeof (rr_t))) == NULL) +	{ +		return -1; +	} +   +	if (rr_options_validate (options, &rr->options) != 0) +	{ +		free (rr); +		return -1; +	} +   +	for (i = 0; i < rr->options.read_only_subvolume_count; i++) +	{ +		char found = 0; +       +		for (children = this_xl->children;  +		     children != NULL;  +		     children = children->next) +		{ +			if (strcmp (rr->options.read_only_subvolume_list[i],  +				    children->xlator->name) == 0) +			{ +				found = 1; +				break; +			} +		} +       +		if (!found) +		{ +			LOG_ERROR ("read-only subvolume [%s] not found in volume list",  +				   rr->options.read_only_subvolume_list[i]); +			_cleanup_rr (rr); +			return -1; +		} +	} +   +	for (children = this_xl->children;  +	     children != NULL;  +	     children = children->next) +	{ +		children_count++; +	} +   +	/* bala: excluding read_only_subvolumes */ +	if ((rr->subvolume_count = children_count -  +	     rr->options.read_only_subvolume_count) == 0) +	{ +		LOG_ERROR ("no writable volumes found for scheduling"); +		_cleanup_rr (rr); +		return -1; +	} +   +	if ((rr->subvolume_list = CALLOC (rr->subvolume_count,  +					  sizeof (rr_subvolume_t))) == NULL) +	{ +		_cleanup_rr (rr); +		return -1; +	} +   +	i = 0; +	j = 0; +	for (children = this_xl->children;  +	     children != NULL;  +	     children = children->next) +	{ +		char found = 0; +       +		for (j = 0; j < rr->options.read_only_subvolume_count; j++) +		{ +			if (strcmp (rr->options.read_only_subvolume_list[i],  +				    children->xlator->name) == 0) +			{ +				found = 1; +				break; +			} +		} +       +		if (!found) +		{ +			rr_subvolume_t *subvolume = NULL; +	   +			subvolume = &rr->subvolume_list[i]; +	   +			subvolume->xl = children->xlator; +			subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; +			subvolume->status = RR_SUBVOLUME_ONLINE; +	   +			i++; +		} +	} +   +	rr->schedule_index = UINT64_MAX; +	rr->last_stat_fetched_time.tv_sec = 0; +	rr->last_stat_fetched_time.tv_usec = 0; +	pthread_mutex_init (&rr->mutex, NULL); +   +	*((long *)this_xl->private) = (long)rr; +   +	return 0; +} + +void  +rr_fini (xlator_t *this_xl) +{ +	rr_t *rr = NULL; +   +	if (this_xl == NULL) +	{ +		return; +	} +   +	if ((rr = (rr_t *) *((long *)this_xl->private)) != NULL) +	{ +		pthread_mutex_destroy (&rr->mutex); +		_cleanup_rr (rr); +		this_xl->private = NULL; +	} +   +	return; +} + +xlator_t * +rr_schedule (xlator_t *this_xl, const void *path) +{ +	rr_t *rr = NULL; +	uint64_t next_schedule_index = 0; +	int i = 0; +   +	if (this_xl == NULL || path == NULL) +	{ +		return NULL; +	} +   +	rr = (rr_t 
*) *((long *)this_xl->private); +	next_schedule_index = ROUND_ROBIN (rr->schedule_index,  +					   rr->subvolume_count); +   +	rr_update (this_xl); +   +	for (i = next_schedule_index; i < rr->subvolume_count; i++) +	{ +		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE &&  +		    rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) +		{ +			pthread_mutex_lock (&rr->mutex); +			rr->schedule_index = i; +			pthread_mutex_unlock (&rr->mutex); +			return rr->subvolume_list[i].xl; +		} +	} +   +	for (i = 0; i < next_schedule_index; i++) +	{ +		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE &&  +		    rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) +		{ +			pthread_mutex_lock (&rr->mutex); +			rr->schedule_index = i; +			pthread_mutex_unlock (&rr->mutex); +			return rr->subvolume_list[i].xl; +		} +	} +   +	for (i = next_schedule_index; i < rr->subvolume_count; i++) +	{ +		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) +		{ +			pthread_mutex_lock (&rr->mutex); +			rr->schedule_index = i; +			pthread_mutex_unlock (&rr->mutex); +			return rr->subvolume_list[i].xl; +		} +	} +   +	for (i = 0; i < next_schedule_index; i++) +	{ +		if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) +		{ +			pthread_mutex_lock (&rr->mutex); +			rr->schedule_index = i; +			pthread_mutex_unlock (&rr->mutex); +			return rr->subvolume_list[i].xl; +		} +	} +   +	return NULL; +} + +void +rr_update (xlator_t *this_xl) +{ +	rr_t *rr = NULL; +	struct timeval ctime = {0, 0}; +	int i = 0; +   +	if (this_xl == NULL) +	{ +		return ; +	} +   +	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) +	{ +		return ; +	} +   +	if (gettimeofday (&ctime, NULL) != 0) +	{ +		return ; +	} +   +	if (ctime.tv_sec > (rr->options.refresh_interval +  +			    rr->last_stat_fetched_time.tv_sec)) +	{ +		pthread_mutex_lock (&rr->mutex); +		rr->last_stat_fetched_time = ctime; +		pthread_mutex_unlock (&rr->mutex); +       +		for (i = 0; i < rr->subvolume_count; i++) +		{ +			xlator_t *subvolume_xl = NULL; +			call_frame_t *frame = NULL; +			call_pool_t *pool = NULL; +	   +			subvolume_xl = rr->subvolume_list[i].xl; +	   +			pool = this_xl->ctx->pool; +	   +			frame = create_frame (this_xl, pool); + +			STACK_WIND_COOKIE (frame, +					   rr_update_cbk,  +					   subvolume_xl->name,  +					   subvolume_xl,  +					   subvolume_xl->mops->stats,  +					   0); +		} +	} +   +	return ; +} + +int  +rr_update_cbk (call_frame_t *frame,  +	       void *cookie,  +	       xlator_t *this_xl,  +	       int32_t op_ret,  +	       int32_t op_errno,  +	       struct xlator_stats *stats) +{ +	rr_t *rr = NULL; +	rr_subvolume_t *subvolume = NULL; +	uint8_t free_disk_percent = 0; +	int i = 0; +   +	if (frame == NULL) +	{ +		return -1; +	} +   +	if (cookie == NULL || this_xl == NULL) +	{ +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	if (op_ret == 0 && stats == NULL) +	{ +		LOG_CRITICAL ("fatal! op_ret is 0 and stats is NULL.  
" +			      "Please report this to <gluster-devel@nongnu.org>"); +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) +	{ +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	for (i = 0; i < rr->subvolume_count; i++) +	{ +		if (rr->subvolume_list[i].xl->name == (char *) cookie) +		{ +			subvolume = &rr->subvolume_list[i]; +			break; +		} +	} +   +	if (subvolume == NULL) +	{ +		LOG_ERROR ("unknown cookie [%s]", (char *) cookie); +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	if (op_ret == 0) +	{ +		free_disk_percent = (stats->free_disk * 100) / stats->total_disk_size; +		if (free_disk_percent > rr->options.min_free_disk) +		{ +			if (subvolume->free_disk_status != RR_MIN_FREE_DISK_NOT_REACHED) +			{ +				pthread_mutex_lock (&rr->mutex); +				subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; +				pthread_mutex_unlock (&rr->mutex); +				LOG_WARNING ("subvolume [%s] is available with free space for scheduling",  +					     subvolume->xl->name); +			} +		} +		else +		{ +			if (subvolume->free_disk_status != RR_MIN_FREE_DISK_REACHED) +			{ +				pthread_mutex_lock (&rr->mutex); +				subvolume->free_disk_status = RR_MIN_FREE_DISK_REACHED; +				pthread_mutex_unlock (&rr->mutex); +				LOG_WARNING ("subvolume [%s] reached minimum disk space requirement",  +					     subvolume->xl->name); +			} +		} +	} +	else  +	{ +		pthread_mutex_lock (&rr->mutex); +		subvolume->status = RR_SUBVOLUME_OFFLINE; +		pthread_mutex_unlock (&rr->mutex); +		LOG_ERROR ("unable to get subvolume [%s] status information and " +			   "scheduling is disabled",  +			   subvolume->xl->name); +	} +   +	STACK_DESTROY (frame->root); +	return 0; +} + +void +rr_notify (xlator_t *this_xl, int32_t event, void *data) +{ +	rr_t *rr = NULL; +	rr_subvolume_t *subvolume = NULL; +	xlator_t *subvolume_xl = NULL; +	int i = 0, ret = 0; +	call_frame_t *frame = NULL; +	call_pool_t *pool = NULL; +	dict_t *xattr = get_new_dict (); +	int32_t version[1] = {1}; + +	if (this_xl == NULL || data == NULL) { +		return ; +	} +   +	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) { +		return ; +	} +   +	subvolume_xl = (xlator_t *) data; +   +	for (i = 0; i < rr->subvolume_count; i++) { +		if (rr->subvolume_list[i].xl == subvolume_xl) { +			subvolume = &rr->subvolume_list[i]; +			break; +		} +	} +   +	switch (event) { +	case GF_EVENT_CHILD_UP: +		/* Seeding, to be done only once */ +		if (rr->first_time && (i == rr->subvolume_count)) { +			loc_t loc = {0,}; +			xlator_t *trav = NULL; + +			pool = this_xl->ctx->pool; +			frame = create_frame (this_xl, pool); +			ret = dict_set_bin (xattr, "trusted.glusterfs.scheduler.rr", +					    version, sizeof (int32_t)); +			if (-1 == ret) { +				gf_log (this_xl->name, GF_LOG_ERROR, "rr seed setting failed"); +			} +			if (xattr) +				dict_ref (xattr); +			 +			loc.path = strdup ("/"); +			for (trav = this_xl->parents->xlator; trav; trav = trav->parents->xlator) { +				if (trav->itable) { +					loc.inode = trav->itable->root; +					break; +				} +			} +			STACK_WIND (frame, +				    rr_notify_cbk, +				    (xlator_t *)data, +				    ((xlator_t *)data)->fops->xattrop, +				    &loc, +				    GF_XATTROP_ADD_ARRAY, +				    xattr); +	   +			if (xattr) +				dict_unref (xattr); + +			rr->first_time = 0; +		} +		if (subvolume) { +			pthread_mutex_lock (&rr->mutex); +			subvolume->status = RR_SUBVOLUME_ONLINE; +			pthread_mutex_unlock (&rr->mutex); +		} +		break; +	case GF_EVENT_CHILD_DOWN: +		if (subvolume) { +			pthread_mutex_lock 
(&rr->mutex); +			subvolume->status = RR_SUBVOLUME_OFFLINE; +			pthread_mutex_unlock (&rr->mutex); +		} +		break; +	} +   +	return ; +} + +int  +rr_notify_cbk (call_frame_t *frame,  +	       void *cookie,  +	       xlator_t *this_xl,  +	       int32_t op_ret,  +	       int32_t op_errno, +	       dict_t *xattr) +{ +	rr_t *rr = NULL; +	int32_t *index = NULL; +	int32_t ret = -1; +	void *tmp_index_ptr = NULL; + +	if (frame == NULL)  +	{ +		return -1; +	} +   +	if ((this_xl == NULL) || (op_ret == -1)) +	{ +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) +	{ +		STACK_DESTROY (frame->root); +		return -1; +	} +   +	ret = dict_get_bin (xattr, "trusted.glusterfs.scheduler.rr", &tmp_index_ptr); +	index = tmp_index_ptr; +	if (ret == 0) +		rr->schedule_index = (index[0] % rr->subvolume_count); +	else +		rr->schedule_index = 0; + +	STACK_DESTROY (frame->root); +	return 0; +} + +struct sched_ops sched = { +	.init     = rr_init, +	.fini     = rr_fini, +	.update   = rr_update, +	.schedule = rr_schedule, +	.notify   = rr_notify +}; + diff --git a/scheduler/rr/src/rr.h b/scheduler/rr/src/rr.h new file mode 100644 index 00000000000..baa471209e8 --- /dev/null +++ b/scheduler/rr/src/rr.h @@ -0,0 +1,70 @@ +/* +   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _RR_H +#define _RR_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include <stdint.h> +#include <sys/time.h> + +struct rr_subvolume +{ +  xlator_t  *xl; +  uint8_t   free_disk_status; +  uint8_t   status; +}; +typedef struct rr_subvolume rr_subvolume_t; + +struct rr +{ +  rr_options_t    options; +  rr_subvolume_t  *subvolume_list; +  uint64_t        subvolume_count; +  uint64_t        schedule_index; +  struct timeval  last_stat_fetched_time; +  pthread_mutex_t mutex; +  char            first_time; +}; +typedef struct rr rr_t; + +int rr_init (xlator_t *this_xl); +void rr_fini (xlator_t *this_xl); +xlator_t *rr_schedule (xlator_t *this_xl, const void *path); +void rr_update (xlator_t *this_xl); +int rr_update_cbk (call_frame_t *frame,  +		   void *cookie,  +		   xlator_t *this_xl,  +		   int32_t op_ret,  +		   int32_t op_errno,  +		   struct xlator_stats *stats); +void rr_notify (xlator_t *this_xl, int32_t event, void *data); +int rr_notify_cbk (call_frame_t *frame,  +		   void *cookie,  +		   xlator_t *this_xl,  +		   int32_t op_ret,  +		   int32_t op_errno, +		   dict_t *xattr); + +#endif /* _RR_H */ diff --git a/scheduler/switch/Makefile.am b/scheduler/switch/Makefile.am new file mode 100644 index 00000000000..d471a3f9243 --- /dev/null +++ b/scheduler/switch/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES =  diff --git a/scheduler/switch/src/Makefile.am b/scheduler/switch/src/Makefile.am new file mode 100644 index 00000000000..dc7d16d40d2 --- /dev/null +++ b/scheduler/switch/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = switch.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +switch_la_LDFLAGS = -module -avoidversion + +switch_la_SOURCES = switch.c +switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/scheduler/switch/src/switch.c b/scheduler/switch/src/switch.c new file mode 100644 index 00000000000..70b3071871d --- /dev/null +++ b/scheduler/switch/src/switch.c @@ -0,0 +1,398 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + +#include <sys/time.h> +#include <stdlib.h> +#include <fnmatch.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "scheduler.h" + +struct switch_sched_array { +	xlator_t *xl; +	int32_t   eligible; +	int32_t   considered; +}; + +/* Select one of this struct based on the path's pattern match */ +struct switch_sched_struct { +	struct switch_sched_struct *next; +	struct switch_sched_array  *array; +	char                        path_pattern[256]; +	int32_t                     node_index; /* Index of the node in  +						   this pattern. */ +	int32_t                     num_child;  /* Total num of child nodes  +						   with this pattern. */ +}; + +struct switch_struct { +	struct switch_sched_struct *cond; +	struct switch_sched_array  *array; +	pthread_mutex_t             switch_mutex; +	int32_t                     child_count; +}; + +/* This function should return child node as '*:subvolumes' is inserterd */ +static xlator_t * +switch_get_matching_xl (const char *path, struct switch_sched_struct *cond) +{ +	struct switch_sched_struct *trav      = cond; +	char                       *pathname  = strdup (path); +	int                         index     = 0; + +	while (trav) { +		if (fnmatch (trav->path_pattern,  +			     pathname, FNM_NOESCAPE) == 0) { +			free (pathname); +			trav->node_index %= trav->num_child; +			index = (trav->node_index++) % trav->num_child; +			return trav->array[index].xl; +		} +		trav = trav->next; +	} +	free (pathname); +	return NULL; +} + + +static int32_t +switch_init (xlator_t *xl) +{ +	int32_t index = 0; +	data_t *data = NULL; +	char *child = NULL; +	char *tmp = NULL; +	char *childs_data = NULL; +	xlator_list_t *trav_xl = xl->children; +	struct switch_struct *switch_buf = NULL; +   +	switch_buf = CALLOC (1, sizeof (struct switch_struct)); +	ERR_ABORT (switch_buf); + +	while (trav_xl) { +		index++; +		trav_xl = trav_xl->next; +	} +	switch_buf->child_count = index; +	switch_buf->array = CALLOC (index + 1,  +				    sizeof (struct switch_sched_struct)); +	ERR_ABORT (switch_buf->array); +	trav_xl = xl->children; +	index = 0; + +	while (trav_xl) { +		switch_buf->array[index].xl = trav_xl->xlator; +		switch_buf->array[index].eligible = 1; +		trav_xl = trav_xl->next; +		index++; +	} + +	data = dict_get (xl->options, "scheduler.read-only-subvolumes"); +	if (data) { +		childs_data = strdup (data->data); +		child = strtok_r (childs_data, ",", &tmp); +		while (child) { +			for (index = 1;  +			     index < switch_buf->child_count; index++) { +				if (strcmp (switch_buf->array[index - 1].xl->name, child) == 0) { +					gf_log ("switch", GF_LOG_DEBUG,  +						"Child '%s' is read-only",  +						child); +					memcpy (&(switch_buf->array[index-1]), +						&(switch_buf->array[switch_buf->child_count - 1]),  +						sizeof (struct switch_sched_array)); +					switch_buf->child_count--; +					break; +				} +			} +			child = strtok_r (NULL, ",", &tmp); +		} +		free (childs_data); +	} + +	data = dict_get (xl->options, "scheduler.local-volume-name"); +	if (data) { +		/* Means, give preference to that node first */ +		gf_log ("switch", GF_LOG_DEBUG,  +			"local volume defined as %s", data->data); + +		/* TODO: parse it properly, have an extra index to  +		   specify that first */ +	} + +	/*  *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ +	data = dict_get (xl->options, "scheduler.switch.case"); +	if (data) { +		char *tmp_str = NULL;  +		char *tmp_str1 = NULL; +		char *dup_str = NULL; +		char *switch_str = NULL; +		char 
*pattern = NULL; +		char *childs = NULL; +		struct switch_sched_struct *switch_opt = NULL; +		struct switch_sched_struct *trav = NULL; +		/* Get the pattern for considering switch case.  +		   "option block-size *avi:10MB" etc */ +		switch_str = strtok_r (data->data, ";", &tmp_str); +		while (switch_str) { +			dup_str = strdup (switch_str); +			switch_opt =  +				CALLOC (1,  +					sizeof (struct switch_sched_struct)); +			ERR_ABORT (switch_opt); + +			/* Link it to the main structure */ +			if (switch_buf->cond) { +				/* there are already few entries */ +				trav = switch_buf->cond; +				while (trav->next) +					trav = trav->next; +				trav->next = switch_opt; +			} else { +				/* First entry */ +				switch_buf->cond = switch_opt; +			} +			pattern = strtok_r (dup_str, ":", &tmp_str1); +			childs = strtok_r (NULL, ":", &tmp_str1); +			if (strncmp (pattern, "*", 2) == 0) { +				gf_log ("switch", GF_LOG_WARNING, +					"'*' pattern will be taken by default " +					"for all the unconfigured child nodes," +					" hence neglecting current option"); +				switch_str = strtok_r (NULL, ";", &tmp_str); +				free (dup_str); +				continue; +			} +			memcpy (switch_opt->path_pattern,  +				pattern, strlen (pattern)); +			if (childs) { +				int32_t idx = 0; +				char *tmp1 = NULL; +				char *dup_childs = NULL; +				/* TODO: get the list of child nodes for  +				   the given pattern */ +				dup_childs = strdup (childs); +				child = strtok_r (dup_childs, ",", &tmp); +				while (child) { +					idx++; +					child = strtok_r (NULL, ",", &tmp); +				} +				free (dup_childs); +				child = strtok_r (childs, ",", &tmp1); +				switch_opt->num_child = idx; +				switch_opt->array =  +					CALLOC (1, idx * sizeof (struct switch_sched_array)); +				ERR_ABORT (switch_opt->array); +				idx = 0; +				child = strtok_r (childs, ",", &tmp); +				while (child) { +					for (index = 1;  +					     index < switch_buf->child_count;  +					     index++) { +						if (strcmp (switch_buf->array[index - 1].xl->name,  +							    child) == 0) { +							gf_log ("switch",  +								GF_LOG_DEBUG, +								"'%s' pattern will be scheduled to \"%s\"", +								switch_opt->path_pattern, child); +							/* +							  if (switch_buf->array[index-1].considered) { +							  gf_log ("switch", GF_LOG_DEBUG,  +							  "ambiguity found, exiting"); +							  return -1; +							  } +							*/ +							switch_opt->array[idx].xl = switch_buf->array[index-1].xl; +							switch_buf->array[index-1].considered = 1; +							idx++; +							break; +						} +					} +					child = strtok_r (NULL, ",", &tmp1); +				} +			} else { +				/* error */ +				gf_log ("switch", GF_LOG_ERROR,  +					"Check \"scheduler.switch.case\" " +					"option in unify volume. Exiting"); +				free (switch_buf->array); +				free (switch_buf); +				return -1; +			} +			free (dup_str); +			switch_str = strtok_r (NULL, ";", &tmp_str); +		} +	} +	/* Now, all the pattern based considerations done, so for all the  +	 * remaining pattern, '*' to all the remaining child nodes +	 */ +	{ +		struct switch_sched_struct *switch_opt = NULL; +		int32_t flag = 0; +		int32_t index = 0; +		for (index=0; index < switch_buf->child_count; index++) { +			/* check for considered flag */ +			if (switch_buf->array[index].considered) +				continue; +			flag++; +		} +		if (!flag) { +			gf_log ("switch", GF_LOG_ERROR, +				"No nodes left for pattern '*'. 
Exiting."); +			return -1; +		} +		switch_opt = CALLOC (1, sizeof (struct switch_sched_struct)); +		ERR_ABORT (switch_opt); +		if (switch_buf->cond) { +			/* there are already few entries */ +			struct switch_sched_struct *trav = switch_buf->cond; +			while (trav->next) +				trav = trav->next; +			trav->next = switch_opt; +		} else { +			/* First entry */ +			switch_buf->cond = switch_opt; +		} +		/* Add the '*' pattern to the array */ +		memcpy (switch_opt->path_pattern, "*", 2); +		switch_opt->num_child = flag; +		switch_opt->array =  +			CALLOC (1, flag * sizeof (struct switch_sched_array)); +		ERR_ABORT (switch_opt->array); +		flag = 0; +		for (index=0; index < switch_buf->child_count; index++) { +			/* check for considered flag */ +			if (switch_buf->array[index].considered) +				continue; +			gf_log ("switch", GF_LOG_DEBUG, +				"'%s' pattern will be scheduled to \"%s\"", +				switch_opt->path_pattern,  +				switch_buf->array[index].xl->name); +			switch_opt->array[flag].xl =  +				switch_buf->array[index].xl; +			switch_buf->array[index].considered = 1; +			flag++; +		} +	} + +	pthread_mutex_init (&switch_buf->switch_mutex, NULL); + +	// put it at the proper place +	*((long *)xl->private) = (long)switch_buf;  + +	return 0; +} + +static void +switch_fini (xlator_t *xl) +{ +	/* TODO: free all the allocated entries */ +	struct switch_struct *switch_buf = NULL; +	switch_buf = (struct switch_struct *)*((long *)xl->private); + +	pthread_mutex_destroy (&switch_buf->switch_mutex); +	free (switch_buf->array); +	free (switch_buf); +} + +static xlator_t * +switch_schedule (xlator_t *xl, const void *path) +{ +	struct switch_struct *switch_buf = NULL; +	switch_buf = (struct switch_struct *)*((long *)xl->private); + +	return switch_get_matching_xl (path, switch_buf->cond); +} + + +/** + * notify + */ +void +switch_notify (xlator_t *xl, int32_t event, void *data) +{ +	/* TODO: This should be checking in switch_sched_struct */ +#if 0 +	struct switch_struct *switch_buf = NULL; +	int32_t idx = 0; + +	switch_buf = (struct switch_struct *)*((long *)xl->private); +	if (!switch_buf) +		return; + +	for (idx = 0; idx < switch_buf->child_count; idx++) { +		if (switch_buf->array[idx].xl == (xlator_t *)data) +			break; +	} + +	switch (event) +	{ +	case GF_EVENT_CHILD_UP: +	{ +		switch_buf->array[idx].eligible = 1; +	} +	break; +	case GF_EVENT_CHILD_DOWN: +	{ +		switch_buf->array[idx].eligible = 0; +	} +	break; +	default: +	{ +		; +	} +	break; +	} +#endif +} + +static void  +switch_update (xlator_t *xl) +{ +	return; +} + +struct sched_ops sched = { +	.init     = switch_init, +	.fini     = switch_fini, +	.update   = switch_update, +	.schedule = switch_schedule, +	.notify   = switch_notify +}; + +struct volume_options options[] = { +	{ .key   = { "scheduler.read-only-subvolumes" , +		     "switch.read-only-subvolumes"},   +	  .type  = GF_OPTION_TYPE_ANY +	}, +	{ .key   = { "scheduler.local-volume-name", +		     "switch.nufa.local-volume-name" }, +	  .type  = GF_OPTION_TYPE_XLATOR +	}, +	{ .key   = { "scheduler.switch.case", +		     "switch.case" }, +	  .type  = GF_OPTION_TYPE_ANY +	}, +	{ .key = {NULL} } +};  | 
