diff options
author | Jeff Darcy <jdarcy@fb.com> | 2017-09-15 06:59:01 -0700 |
---|---|---|
committer | Jeff Darcy <jdarcy@fb.com> | 2017-09-15 13:47:01 -0700 |
commit | 8dfdecf220d1c9365e1f8d6af9ead5e48c61e2eb (patch) | |
tree | bccd5906be43cf81792248b06099006525ed0c27 | |
parent | e4b47b5d54644c398c424a99116a0cc37e4431d6 (diff) |
Replace namespace/io-stats/io-threads with 3.6-fb versions
This rolls up multiple patches related to namespace identification and
throttling/QoS. This primarily includes the following, all by Michael
Goulet <mgoulet@fb.com>.
io-threads: Add weighted round robin queueing by namespace
https://phabricator.facebook.com/D5615269
io-threads: Add per-namespaces queue sizes to IO_THREADS_QUEUE_SIZE_KEY
https://phabricator.facebook.com/D5683162
io-threads: Implement better slot allocation algorithm
https://phabricator.facebook.com/D5683186
io-threads: Only enable weighted queueing on bricks
https://phabricator.facebook.com/D5700062
io-threads: Update queue sizes on drain
https://phabricator.facebook.com/D5704832
Fix parsing (-1) as default NS weight
https://phabricator.facebook.com/D5723383
Parts of the following patches have also been applied to satisfy
dependencies.
io-throttling: Calculate moving averages and throttle offending hosts
https://phabricator.fb.com/D2516161
Shreyas Siravara <sshreyas@fb.com>
Hook up ODS logging for FUSE clients.
https://phabricator.facebook.com/D3963376
Kevin Vigor <kvigor@fb.com>
Add the flag --skip-nfsd-start to skip the NFS daemon starting, even if
it is enabled
https://phabricator.facebook.com/D4575368
Alex Lorca <alexlorca@fb.com>
There are also some "standard" changes: dealing with code that moved,
reindenting to comply with Gluster coding standards, gf_uuid_xxx, etc.
This patch *does* revert some changes which have occurred upstream since
3.6; these will be re-applied as appropriate on top of this new base.
Change-Id: I69024115da7a60811e5b86beae781d602bdb558d
Signed-off-by: Jeff Darcy <jdarcy@fb.com>
-rw-r--r-- | glusterfsd/src/glusterfsd.c | 8 | ||||
-rw-r--r-- | glusterfsd/src/glusterfsd.h | 1 | ||||
-rw-r--r-- | libglusterfs/src/common-utils.c | 28 | ||||
-rw-r--r-- | libglusterfs/src/dict.c | 42 | ||||
-rw-r--r-- | libglusterfs/src/dict.h | 16 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 11 | ||||
-rw-r--r-- | libglusterfs/src/stack.c | 1 | ||||
-rw-r--r-- | libglusterfs/src/stack.h | 15 | ||||
-rw-r--r-- | tests/basic/afr/durability-off.t | 12 | ||||
-rw-r--r-- | tests/basic/fop-sampling.t | 4 | ||||
-rw-r--r-- | tests/basic/stats-dump.t | 4 | ||||
-rw-r--r-- | xlators/debug/io-stats/src/Makefile.am | 6 | ||||
-rw-r--r-- | xlators/debug/io-stats/src/io-stats-mem-types.h | 2 | ||||
-rw-r--r-- | xlators/debug/io-stats/src/io-stats.c | 2487 | ||||
-rw-r--r-- | xlators/debug/io-stats/src/io-stats.h | 381 | ||||
-rw-r--r-- | xlators/features/namespace/src/namespace.c | 77 | ||||
-rw-r--r-- | xlators/features/namespace/src/namespace.h | 1 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 2 | ||||
-rw-r--r-- | xlators/performance/io-threads/src/Makefile.am | 9 | ||||
-rw-r--r-- | xlators/performance/io-threads/src/io-threads.c | 1246 | ||||
-rw-r--r-- | xlators/performance/io-threads/src/io-threads.h | 66 |
21 files changed, 3057 insertions, 1362 deletions
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c index 5022cfc22da..a0fec8f49a1 100644 --- a/glusterfsd/src/glusterfsd.c +++ b/glusterfsd/src/glusterfsd.c @@ -129,6 +129,10 @@ static struct argp_option gf_options[] = { "buffer size, [default: 5]"}, {"log-flush-timeout", ARGP_LOG_FLUSH_TIMEOUT, "LOG-FLUSH-TIMEOUT", 0, "Set log flush timeout, [default: 2 minutes]"}, + {"stats-instance-name", ARGP_STATS_INSTANCE_NAME, + "STATS-INSTANCE-NAME", 0, + "Specify instance name for io-stats translator"}, + {0, 0, 0, 0, "Advanced Options:"}, {"volfile-server-port", ARGP_VOLFILE_SERVER_PORT_KEY, "PORT", 0, @@ -1246,6 +1250,10 @@ no_oom_api: break; + case ARGP_STATS_INSTANCE_NAME: + cmd_args->stats_instance_name = gf_strdup (arg); + break; + case ARGP_LOG_FLUSH_TIMEOUT: if (gf_string2uint32 (arg, &cmd_args->log_flush_timeout)) { argp_failure (state, -1, 0, diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h index b5c6b27b534..940b351e552 100644 --- a/glusterfsd/src/glusterfsd.h +++ b/glusterfsd/src/glusterfsd.h @@ -96,6 +96,7 @@ enum argp_option_keys { #ifdef GF_LINUX_HOST_OS ARGP_OOM_SCORE_ADJ_KEY = 176, #endif + ARGP_STATS_INSTANCE_NAME = 177, }; struct _gfd_vol_top_priv_t { diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 84a0785d660..62c41a767d0 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -4349,9 +4349,11 @@ list_node_del (struct list_node *node) GF_FREE (node); } -const char * -fop_enum_to_pri_string (glusterfs_fop_t fop) +iot_pri_t +iot_fop_to_pri (glusterfs_fop_t fop) { + iot_pri_t pri = IOT_PRI_MAX - 1; + switch (fop) { case GF_FOP_OPEN: case GF_FOP_STAT: @@ -4365,7 +4367,8 @@ fop_enum_to_pri_string (glusterfs_fop_t fop) case GF_FOP_READDIRP: case GF_FOP_GETACTIVELK: case GF_FOP_SETACTIVELK: - return "HIGH"; + pri = IOT_PRI_HI; + break; case GF_FOP_CREATE: case GF_FOP_FLUSH: @@ -4391,7 +4394,8 @@ fop_enum_to_pri_string (glusterfs_fop_t fop) case 
GF_FOP_FREMOVEXATTR: case GF_FOP_IPC: case GF_FOP_LEASE: - return "NORMAL"; + pri = IOT_PRI_NORMAL; + break; case GF_FOP_READ: case GF_FOP_WRITE: @@ -4402,22 +4406,26 @@ fop_enum_to_pri_string (glusterfs_fop_t fop) case GF_FOP_XATTROP: case GF_FOP_FXATTROP: case GF_FOP_RCHECKSUM: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: case GF_FOP_ZEROFILL: - case GF_FOP_FALLOCATE: case GF_FOP_SEEK: - return "LOW"; + pri = IOT_PRI_LO; + break; case GF_FOP_NULL: case GF_FOP_FORGET: case GF_FOP_RELEASE: case GF_FOP_RELEASEDIR: case GF_FOP_GETSPEC: + case GF_FOP_COMPOUND: case GF_FOP_MAXVALUE: - case GF_FOP_DISCARD: - return "LEAST"; - default: - return "UNKNOWN"; + //fail compilation on missing fop + //new fop must choose priority. + break; } + + return pri; } const char * diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 6a61e641e19..82b9236e661 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -1258,6 +1258,38 @@ dict_foreach (dict_t *dict, return ret; } +int +dict_foreach_with_idx (dict_t *dict, + int (*fn)(dict_t *this, + char *key, + data_t *value, + void *data, uint64_t idx), + void *data) +{ + if (!dict) { + gf_log_callingfn ("dict", GF_LOG_WARNING, + "dict is NULL"); + return -1; + } + + uint64_t idx = 0; + int ret = -1; + data_pair_t *pairs = NULL; + data_pair_t *next = NULL; + + pairs = dict->members_list; + while (pairs) { + next = pairs->next; + ret = fn (dict, pairs->key, pairs->value, data, idx); + if (ret < 0) + return ret; + pairs = next; + idx++; + } + + return 0; +} + /* return values: -1 = failure, 0 = no matches found, @@ -2979,6 +3011,16 @@ dict_dump_to_str (dict_t *dict, char *dump, int dumpsize, char *format) return 0; } +/* This function converts a uint32 to a (string) key for use in a dictionary. + * Ensure that the key string buffer is at least DICT_UINT32_KEY_SIZE in + * length, since that's the maximum length of a uint32's string representation + * plus a NULL delimiter char. 
*/ +void +dict_uint32_to_key (uint32_t num, char *key_buf) +{ + snprintf (key_buf, DICT_UINT32_KEY_SIZE, "%u", num); +} + void dict_dump_to_log (dict_t *dict) { diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index 5259c6befa1..da95bc86bec 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -22,7 +22,6 @@ typedef struct _data data_t; typedef struct _dict dict_t; typedef struct _data_pair data_pair_t; - #define GF_PROTOCOL_DICT_SERIALIZE(this,from_dict,to,len,ope,labl) do { \ int ret = 0; \ \ @@ -161,6 +160,8 @@ dict_t *get_new_dict (); #define dict_for_each(d, c) for (c = d->members_list; c; c = c->next) +#define dict_foreach_inline(d, c) for (c = d->members_list; c; c = c->next) + int dict_foreach (dict_t *this, int (*fn)(dict_t *this, char *key, @@ -168,6 +169,13 @@ int dict_foreach (dict_t *this, void *data), void *data); +int dict_foreach_with_idx (dict_t *this, + int (*fn)(dict_t *this, + char *key, + data_t *value, + void *data, uint64_t idx), + void *data); + int dict_foreach_fnmatch (dict_t *dict, char *pattern, int (*fn)(dict_t *this, char *key, @@ -246,6 +254,12 @@ GF_MUST_CHECK int dict_get_str (dict_t *this, char *key, char **str); GF_MUST_CHECK int dict_get_str_boolean (dict_t *this, char *key, int default_val); GF_MUST_CHECK int dict_serialize_value_with_delim (dict_t *this, char *buf, int32_t *serz_len, char delimiter); +/* Log_10(2^32) + 1. This is the length of the longest string representation of + * a 32-bit integer, plus space for '\0'. 
*/ +#define DICT_UINT32_KEY_SIZE 11 + +void dict_uint32_to_key (uint32_t num, char *key_buf); + void dict_dump_to_statedump (dict_t *dict, char *dict_name, char *domain); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 0b033d8bfcf..f618c7aba19 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -331,7 +331,14 @@ static inline const char *fop_pri_to_string (gf_fop_pri_t pri) return FOP_PRI_STRINGS[pri]; } -const char *fop_enum_to_pri_string (glusterfs_fop_t fop); +iot_pri_t iot_fop_to_pri (glusterfs_fop_t fop); + +static inline const char *fop_enum_to_pri_string (glusterfs_fop_t fop) +{ + iot_pri_t pri = iot_fop_to_pri (fop); + return fop_pri_to_string (pri); +} + const char *fop_enum_to_string (glusterfs_fop_t fop); #define GF_SET_IF_NOT_PRESENT 0x1 /* default behaviour */ @@ -444,6 +451,8 @@ struct _cmd_args { #ifdef GF_LINUX_HOST_OS char *oom_score_adj; #endif + + char *stats_instance_name; }; typedef struct _cmd_args cmd_args_t; diff --git a/libglusterfs/src/stack.c b/libglusterfs/src/stack.c index 6977814ec69..adcedd4cc96 100644 --- a/libglusterfs/src/stack.c +++ b/libglusterfs/src/stack.c @@ -36,6 +36,7 @@ create_frame (xlator_t *xl, call_pool_t *pool) frame->root = stack; frame->this = xl; + frame->pri = GF_FOP_PRI_UNSPEC; LOCK_INIT (&frame->lock); INIT_LIST_HEAD (&frame->frames); list_add (&frame->frames, &stack->myframes); diff --git a/libglusterfs/src/stack.h b/libglusterfs/src/stack.h index 9e5355a6044..094dc62312c 100644 --- a/libglusterfs/src/stack.h +++ b/libglusterfs/src/stack.h @@ -77,6 +77,8 @@ struct _call_frame_t { const char *wind_to; const char *unwind_from; const char *unwind_to; + + gf_fop_pri_t pri; }; struct _ns_info { @@ -122,6 +124,16 @@ struct _call_stack_t { ns_info_t ns_info; }; +#define frame_set_throttling(frm, should_throttle) \ + do { \ + if (frm) { \ + if (should_throttle) { \ + frm->pri = IOT_PRI_LEAST; \ + } else { \ + frm->pri = IOT_PRI_UNSPEC; \ + } \ + } \ + } while 
(0) #define frame_set_uid_gid(frm, u, g) \ do { \ @@ -259,6 +271,7 @@ STACK_RESET (call_stack_t *stack) _new->wind_from = __FUNCTION__; \ _new->wind_to = #fn; \ _new->unwind_to = #rfn; \ + _new->pri = frame->pri; \ \ LOCK_INIT (&_new->lock); \ LOCK(&frame->root->stack_lock); \ @@ -321,6 +334,8 @@ STACK_RESET (call_stack_t *stack) _new->wind_from = __FUNCTION__; \ _new->wind_to = #fn; \ _new->unwind_to = #rfn; \ + _new->pri = frame->pri; \ + \ LOCK_INIT (&_new->lock); \ LOCK(&frame->root->stack_lock); \ { \ diff --git a/tests/basic/afr/durability-off.t b/tests/basic/afr/durability-off.t index 155ffa09ef0..0c4f470079a 100644 --- a/tests/basic/afr/durability-off.t +++ b/tests/basic/afr/durability-off.t @@ -5,6 +5,14 @@ . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc +did_fsync () { + local count=$($CLI volume profile $V0 info | grep -w FSYNC | wc -l) + if [ "$count" != "0" ]; then + echo "Y" + else + echo "N" + fi +} cleanup; TEST glusterd @@ -24,7 +32,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 TEST $CLI volume heal $V0 EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0 -EXPECT "^0$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l) +EXPECT "N" did_fsync #Test that fsyncs happen when durability is on TEST $CLI volume set $V0 cluster.ensure-durability on @@ -39,6 +47,6 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 TEST $CLI volume heal $V0 EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0 -EXPECT "^2$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l) +EXPECT "Y" did_fsync cleanup; diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t index e429fd8cb07..a1b3edc3d5d 100644 --- a/tests/basic/fop-sampling.t +++ b/tests/basic/fop-sampling.t @@ -13,7 +13,7 @@ function check_path { op=$1 path=$2 file=$3 - grep $op 
$file | awk -F, '{print $11}' | grep $path 2>&1 > /dev/null + grep $op $file | awk -F, '{print $12}' | grep $path 2>&1 > /dev/null if [ $? -eq 0 ]; then echo "Y" else @@ -106,6 +106,8 @@ for dir in "$N0" "$M0"; do rm $dir/file2 done; +read -p "Continue? " nothing + EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES diff --git a/tests/basic/stats-dump.t b/tests/basic/stats-dump.t index af1ad34702b..a188a45eea1 100644 --- a/tests/basic/stats-dump.t +++ b/tests/basic/stats-dump.t @@ -39,7 +39,9 @@ FUSE_RET="$?" # Test that io-stats is getting queue sizes from io-threads TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfs_nfsd_$V0.dump -TEST ! grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump + +# We should be getting queue sizes on bricks now too. +TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump TEST [ 0 -ne "$BRICK_RET" ] TEST [ 0 -ne "$NFSD_RET" ] diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am index c5df598549a..5d39afc2a14 100644 --- a/xlators/debug/io-stats/src/Makefile.am +++ b/xlators/debug/io-stats/src/Makefile.am @@ -2,17 +2,17 @@ xlator_LTLIBRARIES = io-stats.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug -io_stats_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +io_stats_la_LDFLAGS = -module -avoid-version io_stats_la_SOURCES = io-stats.c io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = io-stats-mem-types.h +noinst_HEADERS = io-stats-mem-types.h io-stats.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src \ -I$(top_srcdir)/rpc/rpc-lib/src \ - -DDATADIR=\"$(localstatedir)\" + -DGLUSTERD_WORKDIR=$(GLUSTERD_WORKDIR) AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/debug/io-stats/src/io-stats-mem-types.h 
b/xlators/debug/io-stats/src/io-stats-mem-types.h index 9dde9373264..af1e59abc2c 100644 --- a/xlators/debug/io-stats/src/io-stats-mem-types.h +++ b/xlators/debug/io-stats/src/io-stats-mem-types.h @@ -13,8 +13,6 @@ #include "mem-types.h" -extern const char *__progname; - enum gf_io_stats_mem_types_ { gf_io_stats_mt_ios_conf = gf_common_mt_end + 1, gf_io_stats_mt_ios_fd, diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index e3511233203..67edd36e611 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -7,8 +7,11 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" #include "xlator.h" -#include "syscall.h" +#endif /** * xlators/debug/io_stats : @@ -35,365 +38,54 @@ #include "logging.h" #include "cli1-xdr.h" #include "statedump.h" +#include "io-stats.h" #include "syncop.h" +#include "hashfn.h" #include <pwd.h> #include <grp.h> -#define MAX_LIST_MEMBERS 100 -#define DEFAULT_PWD_BUF_SZ 16384 -#define DEFAULT_GRP_BUF_SZ 16384 -#define IOS_MAX_ERRORS 132 - -typedef enum { - IOS_STATS_TYPE_NONE, - IOS_STATS_TYPE_OPEN, - IOS_STATS_TYPE_READ, - IOS_STATS_TYPE_WRITE, - IOS_STATS_TYPE_OPENDIR, - IOS_STATS_TYPE_READDIRP, - IOS_STATS_TYPE_READ_THROUGHPUT, - IOS_STATS_TYPE_WRITE_THROUGHPUT, - IOS_STATS_TYPE_MAX -}ios_stats_type_t; - -typedef enum { - IOS_STATS_THRU_READ, - IOS_STATS_THRU_WRITE, - IOS_STATS_THRU_MAX, -}ios_stats_thru_t; - -struct ios_stat_lat { - struct timeval time; - double throughput; -}; - -struct ios_stat { - gf_lock_t lock; - uuid_t gfid; - char *filename; - uint64_t counters [IOS_STATS_TYPE_MAX]; - struct ios_stat_lat thru_counters [IOS_STATS_THRU_MAX]; - int refcnt; -}; - -struct ios_stat_list { - struct list_head list; - struct ios_stat *iosstat; - double value; -}; - -struct ios_stat_head { - gf_lock_t lock; - double min_cnt; - uint64_t 
members; - struct ios_stat_list *iosstats; -}; - -typedef struct _ios_sample_t { - uid_t uid; - gid_t gid; - char identifier[UNIX_PATH_MAX]; - char path[UNIX_PATH_MAX]; - glusterfs_fop_t fop_type; - struct timeval timestamp; - double elapsed; - gf_boolean_t have_path; - int32_t op_ret; - int32_t op_errno; -} ios_sample_t; - - -typedef struct _ios_sample_buf_t { - uint64_t pos; /* Position in write buffer */ - uint64_t size; /* Size of ring buffer */ - uint64_t collected; /* Number of samples we've collected */ - uint64_t observed; /* Number of FOPs we've observed */ - ios_sample_t *ios_samples; /* Our list of samples */ -} ios_sample_buf_t; - - -struct ios_lat { - double min; - double max; - double avg; - uint64_t total; -}; - -struct ios_global_stats { - uint64_t data_written; - uint64_t data_read; - uint64_t block_count_write[32]; - uint64_t block_count_read[32]; - uint64_t fop_hits[GF_FOP_MAXVALUE]; - struct timeval started_at; - struct ios_lat latency[GF_FOP_MAXVALUE]; - uint64_t errno_count[IOS_MAX_ERRORS]; - uint64_t nr_opens; - uint64_t max_nr_opens; - struct timeval max_openfd_time; -}; - -/* This is a list of errors which are in some way critical. - * It is useful to sample these errors even if other errors - * should be ignored. 
*/ -const int32_t ios_hard_error_list[] = { - EIO, - EROFS, - ENOSPC, - ENOTCONN, - ESTALE, -}; - -#define IOS_HARD_ERROR_LIST_SIZE (sizeof(ios_hard_error_list) / sizeof(int32_t)) - -const char *errno_to_name[IOS_MAX_ERRORS] = { - "success", /* 0 */ - "eperm", - "enoent", - "esrch", - "eintr", - "eio", - "enxio", - "e2big", - "enoexec", - "ebadf", - "echild", - "eagain", - "enomem", - "eacces", - "efault", - "enotblk", - "ebusy", - "eexist", - "exdev", - "enodev", - "enotdir", - "eisdir", - "einval", - "enfile", - "emfile", - "enotty", - "etxtbsy", - "efbig", - "enospc", - "espipe", - "erofs", - "emlink", - "epipe", - "edom", - "erange", - "edeadlk", - "enametoolong", - "enolck", - "enosys", - "enotempty", - "eloop", - "ewouldblock", - "enomsg", - "eidrm", - "echrng", - "el2nsync", - "el3hlt", - "el3rst", - "elnrng", - "eunatch", - "enocsi", - "el2hlt", - "ebade", - "ebadr", - "exfull", - "enoano", - "ebadrqc", - "ebadslt", - "edeadlock", - "ebfont", - "enostr", - "enodata", - "etime", - "enosr", - "enonet", - "enopkg", - "eremote", - "enolink", - "eadv", - "esrmnt", - "ecomm", - "eproto", - "emultihop", - "edotdot", - "ebadmsg", - "eoverflow", - "enotuniq", - "ebadfd", - "eremchg", - "elibacc", - "elibbad", - "elibscn", - "elibmax", - "elibexec", - "eilseq", - "erestart", - "estrpipe", - "eusers", - "enotsock", - "edestaddrreq", - "emsgsize", - "eprototype", - "enoprotoopt", - "eprotonosupport", - "esocktnosupport", - "eopnotsupp", - "epfnosupport", - "eafnosupport", - "eaddrinuse", - "eaddrnotavail", - "enetdown", - "enetunreach", - "enetreset", - "econnaborted", - "econnreset", - "enobufs", - "eisconn", - "enotconn", - "eshutdown", - "etoomanyrefs", - "etimedout", - "econnrefused", - "ehostdown", - "ehostunreach", - "ealready", - "einprogress", - "estale", - "euclean", - "enotnam", - "enavail", - "eisnam", - "eremoteio", - "edquot", - "enomedium", - "emediumtype", - "ecanceled", - "enokey", - "ekeyexpired", - "ekeyrevoked", - "ekeyrejected", - "eownerdead", - 
"enotrecoverable" -}; - -struct ios_conf { - gf_lock_t lock; - struct ios_global_stats cumulative; - uint64_t increment; - struct ios_global_stats incremental; - gf_boolean_t dump_fd_stats; - gf_boolean_t count_fop_hits; - gf_boolean_t measure_latency; - struct ios_stat_head list[IOS_STATS_TYPE_MAX]; - struct ios_stat_head thru_list[IOS_STATS_THRU_MAX]; - int32_t ios_dump_interval; - pthread_t dump_thread; - gf_boolean_t dump_thread_should_die; - gf_lock_t ios_sampling_lock; - int32_t ios_sample_interval; - int32_t ios_sample_buf_size; - ios_sample_buf_t *ios_sample_buf; - struct dnscache *dnscache; - int32_t ios_dnscache_ttl_sec; - gf_boolean_t iamshd; - gf_boolean_t iamnfsd; - gf_boolean_t iambrickd; - gf_boolean_t iamgfproxyd; - gf_boolean_t audit_creates_and_unlinks; - gf_boolean_t sample_hard_errors; - gf_boolean_t sample_all_errors; - uint32_t outstanding_req; - gf_boolean_t dump_percentile_latencies; -}; - - -struct ios_fd { - char *filename; - uint64_t data_written; - uint64_t data_read; - uint64_t block_count_write[32]; - uint64_t block_count_read[32]; - struct timeval opened_at; -}; +const char *get_ns_from_hash (struct ios_conf *conf, const ns_info_t *info); +int ios_conf_load_namespace_conf (struct ios_conf *conf); -typedef enum { - IOS_DUMP_TYPE_NONE = 0, - IOS_DUMP_TYPE_FILE = 1, - IOS_DUMP_TYPE_DICT = 2, - IOS_DUMP_TYPE_JSON_FILE = 3, - IOS_DUMP_TYPE_SAMPLES = 4, - IOS_DUMP_TYPE_MAX = 5 -} ios_dump_type_t; - -struct ios_dump_args { - ios_dump_type_t type; - union { - FILE *logfp; - dict_t *dict; - } u; -}; +gf_boolean_t should_throttle_fop (call_frame_t *frame, xlator_t *this); -typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*, - int , int , uint64_t ) ; +int _ios_swap_sample_buf (xlator_t *this, ios_sample_buf_t **dest); -struct ios_local { - inode_t *inode; - loc_t loc; - fd_t *fd; -}; - -static struct ios_local * -ios_local_new() { - return GF_CALLOC (1, sizeof (struct ios_local), - gf_common_mt_char); -} - -static void 
-ios_local_free (struct ios_local *local) +/* A lot of io-stats logging code depends on the io-stats translator needing + * the top xlator's name, which is special. For example, on bricks, the name + * is the path to the backend storage directory. + * + * Fails gracefully if there is no xlator name, returning "Unknown". */ +const char * +get_top_xlator_name (xlator_t *this) { - if (!local) - return; - - inode_unref (local->inode); + const char *name = NULL; - if (local->fd) - fd_unref (local->fd); - - loc_wipe (&local->loc); - memset (local, 0, sizeof (*local)); - GF_FREE (local); -} + while (this->parents && this->parents->xlator) { + this = this->parents->xlator; + } -struct volume_options options[]; + this = FIRST_CHILD (this); -static int -is_fop_latency_started (call_frame_t *frame) -{ - GF_ASSERT (frame); - struct timeval epoch = {0,}; - return memcmp (&frame->begin, &epoch, sizeof (epoch)); + if (this && this->name) { + return this->name; + } else { + gf_log (GF_IO_STATS, GF_LOG_ERROR, + "Cannot retrieve top-of-stack translator name."); + return "Unknown"; + } } -static void -ios_free_local (call_frame_t *frame) +static void ios_free_local (call_frame_t *frame) { struct ios_local *local = frame->local; - ios_local_free (local); - frame->local = NULL; } -static void -ios_track_loc (call_frame_t *frame, loc_t *loc) +static void ios_track_loc (call_frame_t *frame, loc_t *loc) { struct ios_local *local = NULL; - if (loc && loc->path) { /* Check if frame->local is already set (it should * only be set by either ios_track_loc() or @@ -412,11 +104,9 @@ ios_track_loc (call_frame_t *frame, loc_t *loc) } } -static void -ios_track_fd (call_frame_t *frame, fd_t *fd) +static void ios_track_fd (call_frame_t *frame, fd_t *fd) { struct ios_local *local = NULL; - if (fd && fd->inode) { if (frame->local) { local = frame->local; @@ -429,14 +119,21 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) } } - -#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples" -#ifdef 
GF_LINUX_HOST_OS -#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats" +#ifdef HAVE_ATOMIC_BUILTINS +#define INC_COUNTER(counter, inc, lock) \ + (__sync_fetch_and_add (&(counter), (inc))) #else -#define _IOS_DUMP_DIR DATADIR "/db/glusterd/stats" +#define INC_COUNTER(counter, inc, lock) \ + do { \ + LOCK (&(lock)); \ + { \ + counter += (inc); \ + } \ + UNLOCK (&(lock)); \ + } while (0) #endif +#define FOP_THROTTLE(frame, this) frame_set_throttling (frame, should_throttle_fop (frame, this)) #define END_FOP_LATENCY(frame, op) \ do { \ @@ -455,7 +152,7 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) \ conf = this->private; \ if (conf && conf->measure_latency) { \ - STATS_INC (conf->outstanding_req); \ + INC_COUNTER (conf->outstanding_req, 1, conf->lock);\ gettimeofday (&frame->begin, NULL); \ } else { \ memset (&frame->begin, 0, sizeof (frame->begin));\ @@ -474,24 +171,30 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) conf->incremental.fop_hits[GF_FOP_##op]++; \ } while (0) - -#if defined(HAVE_ATOMIC_BUILTINS) -#define STATS_LOCK(x) -#define STATS_UNLOCK(x) -#define STATS_ADD(x,i) __sync_add_and_fetch (&x, i) -#define STATS_SUB(x,i) __sync_sub_and_fetch (&x, i) -#define STATS_INC(x) STATS_ADD(x, 1) -#define STATS_DEC(x) STATS_SUB(x, 1) +#ifdef HAVE_ATOMIC_BUILTINS +#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \ + do { \ + struct ios_conf *conf = NULL; \ + \ + if (!is_fop_latency_started (frame)) \ + break; \ + conf = this->private; \ + if (conf && conf->measure_latency && \ + conf->count_fop_hits) { \ + BUMP_FOP (op); \ + __sync_fetch_and_sub (&conf->outstanding_req, 1); \ + if (op_ret != 0 && op_errno > 0 \ + && op_errno < IOS_MAX_ERRORS) { \ + __sync_fetch_and_add ( \ + &conf->cumulative.errno_count[op_errno], 1); \ + __sync_fetch_and_add ( \ + &conf->incremental.errno_count[op_errno], 1); \ + } \ + gettimeofday (&frame->end, NULL); \ + update_ios_latency (conf, frame, GF_FOP_##op, op_ret, op_errno); \ + } \ + } while (0) #else -#define STATS_LOCK(x) 
LOCK (x) -#define STATS_UNLOCK(x) UNLOCK (x) -#define STATS_ADD(x,i) (x) += (i) -#define STATS_SUB(x,i) (x) -= (i) -#define STATS_INC(x) (x) += 1 -#define STATS_DEC(x) (x) -= 1 -#endif - - #define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \ do { \ struct ios_conf *conf = NULL; \ @@ -499,94 +202,200 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) if (!is_fop_latency_started (frame)) \ break; \ conf = this->private; \ - STATS_LOCK (&conf->lock); \ + LOCK (&conf->lock); \ { \ if (conf && conf->measure_latency && \ conf->count_fop_hits) { \ BUMP_FOP(op); \ - STATS_DEC (conf->outstanding_req); \ + conf->outstanding_req -= 1; \ if (op_ret != 0 && op_errno > 0 \ && op_errno < IOS_MAX_ERRORS) { \ - STATS_INC(conf->cumulative.errno_count[op_errno]); \ - STATS_INC(conf->incremental.errno_count[op_errno]); \ + conf->cumulative.errno_count[op_errno]++; \ + conf->incremental.errno_count[op_errno]++; \ } \ gettimeofday (&frame->end, NULL); \ - update_ios_latency (conf, frame, GF_FOP_##op, \ - op_ret, op_errno); \ + update_ios_latency (conf, frame, GF_FOP_##op, op_ret, op_errno);\ } \ } \ - STATS_UNLOCK (&conf->lock); \ + UNLOCK (&conf->lock); \ } while (0) +#endif -#define BUMP_READ(fd, len) \ - do { \ - struct ios_conf *conf = NULL; \ - struct ios_fd *iosfd = NULL; \ - int lb2 = 0; \ - \ - conf = this->private; \ - lb2 = log_base2 (len); \ - ios_fd_ctx_get (fd, this, &iosfd); \ - if (!conf) \ - break; \ - \ - STATS_LOCK (&conf->lock); \ - { \ - STATS_ADD (conf->cumulative.data_read, len); \ - STATS_ADD (conf->incremental.data_read, len); \ - STATS_ADD (conf->cumulative.block_count_read[lb2], 1); \ - STATS_ADD (conf->incremental.block_count_read[lb2], 1);\ - \ - if (iosfd) { \ - STATS_ADD (iosfd->data_read, len); \ - STATS_ADD (iosfd->block_count_read[lb2], 1); \ - } \ - } \ - STATS_UNLOCK (&conf->lock); \ +#ifdef HAVE_ATOMIC_BUILTINS +#define BUMP_READ(fd, len) \ + do { \ + struct ios_conf *conf = NULL; \ + struct ios_fd *iosfd = NULL; \ + int lb2 = 0; \ + \ + conf = 
this->private; \ + lb2 = log_base2 (len); \ + ios_fd_ctx_get (fd, this, &iosfd); \ + if (!conf) \ + break; \ + \ + __sync_fetch_and_add (&conf->cumulative.data_read, \ + len); \ + __sync_fetch_and_add (&conf->incremental.data_read, \ + len); \ + __sync_fetch_and_add ( \ + &conf->cumulative.block_count_read[lb2], 1); \ + __sync_fetch_and_add ( \ + &conf->incremental.block_count_read[lb2], 1); \ + if (iosfd) { \ + __sync_fetch_and_add (&iosfd->data_read, len); \ + __sync_fetch_and_add ( \ + &iosfd->block_count_read[lb2], 1); \ + } \ } while (0) - -#define BUMP_WRITE(fd, len) \ - do { \ - struct ios_conf *conf = NULL; \ - struct ios_fd *iosfd = NULL; \ - int lb2 = 0; \ - \ - conf = this->private; \ - lb2 = log_base2 (len); \ - ios_fd_ctx_get (fd, this, &iosfd); \ - if (!conf) \ - break; \ - STATS_LOCK (&conf->lock); \ - { \ - STATS_ADD (conf->cumulative.data_written, len); \ - STATS_ADD (conf->incremental.data_written, len); \ - STATS_ADD (conf->cumulative.block_count_write[lb2], 1);\ - STATS_ADD (conf->incremental.block_count_write[lb2], 1);\ - \ - if (iosfd) { \ - STATS_ADD (iosfd->data_written, len); \ - STATS_ADD (iosfd->block_count_write[lb2], 1); \ - } \ - } \ - STATS_UNLOCK (&conf->lock); \ +#else +#define BUMP_READ(fd, len) \ + do { \ + struct ios_conf *conf = NULL; \ + struct ios_fd *iosfd = NULL; \ + int lb2 = 0; \ + \ + conf = this->private; \ + lb2 = log_base2 (len); \ + ios_fd_ctx_get (fd, this, &iosfd); \ + if (!conf) \ + break; \ + \ + LOCK (&conf->lock); \ + { \ + conf->cumulative.data_read += len; \ + conf->incremental.data_read += len; \ + conf->cumulative.block_count_read[lb2]++; \ + conf->incremental.block_count_read[lb2]++; \ + \ + if (iosfd) { \ + iosfd->data_read += len; \ + iosfd->block_count_read[lb2]++; \ + } \ + } \ + UNLOCK (&conf->lock); \ } while (0) +#endif -#define BUMP_STATS(iosstat, type) \ +#ifdef HAVE_ATOMIC_BUILTINS +#define BUMP_WRITE(fd, len) \ do { \ - struct ios_conf *conf = NULL; \ - uint64_t value = 0; \ + struct ios_conf 
*conf = NULL; \ + struct ios_fd *iosfd = NULL; \ + int lb2 = 0; \ \ conf = this->private; \ + lb2 = log_base2 (len); \ + ios_fd_ctx_get (fd, this, &iosfd); \ + if (!conf) \ + break; \ + __sync_fetch_and_add (&conf->cumulative.data_written, \ + len); \ + __sync_fetch_and_add (&conf->incremental.data_written, \ + len); \ + __sync_fetch_and_add ( \ + &conf->cumulative.block_count_write[lb2], 1); \ + __sync_fetch_and_add ( \ + &conf->incremental.block_count_write[lb2], 1); \ + \ + if (iosfd) { \ + __sync_fetch_and_add (&iosfd->data_written, \ + len); \ + __sync_fetch_and_add ( \ + &iosfd->block_count_write[lb2], 1); \ + } \ + } while (0) +#else +#define BUMP_WRITE(fd, len) \ + do { \ + struct ios_conf *conf = NULL; \ + struct ios_fd *iosfd = NULL; \ + int lb2 = 0; \ \ - LOCK(&iosstat->lock); \ + conf = this->private; \ + lb2 = log_base2 (len); \ + ios_fd_ctx_get (fd, this, &iosfd); \ + if (!conf) \ + break; \ + LOCK (&conf->lock); \ { \ - value = STATS_ADD (iosstat->counters[type], 1); \ + conf->cumulative.data_written += len; \ + conf->incremental.data_written += len; \ + conf->cumulative.block_count_write[lb2]++; \ + conf->incremental.block_count_write[lb2]++; \ + \ + if (iosfd) { \ + iosfd->data_written += len; \ + iosfd->block_count_write[lb2]++; \ + } \ } \ - UNLOCK (&iosstat->lock); \ - ios_stat_add_to_list (&conf->list[type], \ - value, iosstat); \ + UNLOCK (&conf->lock); \ } while (0) +#endif +#ifdef HAVE_ATOMIC_BUILTINS +#define BUMP_STATS(iosstat, type) \ + do { \ + struct ios_conf *conf = NULL; \ + uint64_t value = 0; \ + \ + conf = this->private; \ + value = __sync_add_and_fetch (&iosstat->counters[type], 1); \ + ios_stat_add_to_list (&conf->list[type], \ + value, iosstat); \ + \ + } while (0) +#else +#define BUMP_STATS(iosstat, type) \ + do { \ + struct ios_conf *conf = NULL; \ + uint64_t value = 0; \ + \ + conf = this->private; \ + \ + LOCK(&iosstat->lock); \ + { \ + iosstat->counters[type]++; \ + value = iosstat->counters[type]; \ + } \ + UNLOCK 
(&iosstat->lock); \ + ios_stat_add_to_list (&conf->list[type], \ + value, iosstat); \ + \ + } while (0) +#endif + +#ifdef HAVE_ATOMIC_BUILTINS +#define BUMP_THROUGHPUT(iosstat, type) \ + do { \ + struct ios_conf *conf = NULL; \ + double elapsed; \ + struct timeval *begin, *end; \ + double throughput; \ + int flag = 0; \ + \ + begin = &frame->begin; \ + end = &frame->end; \ + \ + elapsed = (end->tv_sec - begin->tv_sec) * 1e6 \ + + (end->tv_usec - begin->tv_usec); \ + throughput = op_ret / elapsed; \ + \ + conf = this->private; \ + \ + if (iosstat->thru_counters[type].throughput \ + <= throughput) { \ + iosstat->thru_counters[type].throughput = \ + throughput; \ + gettimeofday (&iosstat-> \ + thru_counters[type].time, NULL); \ + flag = 1; \ + } \ + if (flag) \ + ios_stat_add_to_list (&conf->thru_list[type], \ + throughput, iosstat); \ + } while (0) +#else #define BUMP_THROUGHPUT(iosstat, type) \ do { \ struct ios_conf *conf = NULL; \ @@ -603,7 +412,7 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) throughput = op_ret / elapsed; \ \ conf = this->private; \ - STATS_LOCK (&iosstat->lock); \ + LOCK (&iosstat->lock); \ { \ if (iosstat->thru_counters[type].throughput \ <= throughput) { \ @@ -614,11 +423,79 @@ ios_track_fd (call_frame_t *frame, fd_t *fd) flag = 1; \ } \ } \ - STATS_UNLOCK (&iosstat->lock); \ - if (flag) \ + UNLOCK (&iosstat->lock); \ + if (flag) \ ios_stat_add_to_list (&conf->thru_list[type], \ throughput, iosstat); \ } while (0) +#endif + + +/* + * So why goto all this trouble? Why not just queue up some samples in + * a big list and malloc away? Well malloc is expensive relative + * to what we are measuring, so cannot have any malloc's (or worse + * callocs) in our measurement code paths. Instead, we are going to + * pre-allocate a circular buffer and collect a maximum number of samples. + * Prior to dumping them all we'll create a new buffer and swap the + * old buffer with the new, and then proceed to dump the statistics + * in our dump thread. 
+ * + */ +ios_sample_buf_t * +ios_create_sample_buf (size_t buf_size) +{ + ios_sample_buf_t *ios_sample_buf = NULL; + ios_sample_t *ios_samples = NULL; + + ios_sample_buf = GF_CALLOC (1, + sizeof (*ios_sample_buf), + gf_io_stats_mt_ios_sample_buf); + if (!ios_sample_buf) + goto err; + + ios_samples = GF_CALLOC (buf_size, + sizeof (*ios_samples), + gf_io_stats_mt_ios_sample); + + if (!ios_samples) + goto err; + + ios_sample_buf->ios_samples = ios_samples; + ios_sample_buf->size = buf_size; + ios_sample_buf->pos = 0; + ios_sample_buf->observed = 0; + ios_sample_buf->collected = 0; + + return ios_sample_buf; +err: + GF_FREE (ios_sample_buf); + return NULL; +} + +void +ios_destroy_sample_buf (ios_sample_buf_t *ios_sample_buf) +{ + GF_FREE (ios_sample_buf->ios_samples); + GF_FREE (ios_sample_buf); +} + +static int +ios_init_sample_buf (struct ios_conf *conf) +{ + int32_t ret = -1; + + GF_ASSERT (conf); + LOCK (&conf->lock); + conf->ios_sample_buf = ios_create_sample_buf ( + conf->ios_sample_buf_size); + if (!conf->ios_sample_buf) + goto out; + ret = 0; +out: + UNLOCK (&conf->lock); + return ret; +} int ios_fd_ctx_get (fd_t *fd, xlator_t *this, struct ios_fd **iosfd) @@ -715,72 +592,6 @@ ios_inode_ctx_get (inode_t *inode, xlator_t *this, struct ios_stat **iosstat) } -/* - * So why goto all this trouble? Why not just queue up some samples in - * a big list and malloc away? Well malloc is expensive relative - * to what we are measuring, so cannot have any malloc's (or worse - * callocs) in our measurement code paths. Instead, we are going to - * pre-allocate a circular buffer and collect a maximum number of samples. - * Prior to dumping them all we'll create a new buffer and swap the - * old buffer with the new, and then proceed to dump the statistics - * in our dump thread. 
- * - */ -ios_sample_buf_t * -ios_create_sample_buf (size_t buf_size) -{ - ios_sample_buf_t *ios_sample_buf = NULL; - ios_sample_t *ios_samples = NULL; - - ios_sample_buf = GF_CALLOC (1, - sizeof (*ios_sample_buf), - gf_io_stats_mt_ios_sample_buf); - if (!ios_sample_buf) - goto err; - - ios_samples = GF_CALLOC (buf_size, - sizeof (*ios_samples), - gf_io_stats_mt_ios_sample); - - if (!ios_samples) - goto err; - - ios_sample_buf->ios_samples = ios_samples; - ios_sample_buf->size = buf_size; - ios_sample_buf->pos = 0; - ios_sample_buf->observed = 0; - ios_sample_buf->collected = 0; - - return ios_sample_buf; -err: - GF_FREE (ios_sample_buf); - return NULL; -} - -void -ios_destroy_sample_buf (ios_sample_buf_t *ios_sample_buf) -{ - GF_FREE (ios_sample_buf->ios_samples); - GF_FREE (ios_sample_buf); -} - -static int -ios_init_sample_buf (struct ios_conf *conf) -{ - int32_t ret = -1; - - GF_ASSERT (conf); - LOCK (&conf->lock); - conf->ios_sample_buf = ios_create_sample_buf ( - conf->ios_sample_buf_size); - if (!conf->ios_sample_buf) - goto out; - ret = 0; -out: - UNLOCK (&conf->lock); - return ret; -} - int ios_stat_add_to_list (struct ios_stat_head *list_head, uint64_t value, struct ios_stat *iosstat) @@ -843,13 +654,11 @@ ios_stat_add_to_list (struct ios_stat_head *list_head, uint64_t value, new->value = value; ios_stat_ref (iosstat); list_add_tail (&new->list, &tmp->list); - if (last) { - stat = last->iosstat; - last->iosstat = NULL; - ios_stat_unref (stat); - list_del (&last->list); - GF_FREE (last); - } + stat = last->iosstat; + last->iosstat = NULL; + ios_stat_unref (stat); + list_del (&last->list); + GF_FREE (last); if (reposition == MAX_LIST_MEMBERS) list_head->min_cnt = value; else if (min_count) { @@ -876,7 +685,7 @@ out: return 0; } -static int +static inline int ios_stats_cleanup (xlator_t *this, inode_t *inode) { @@ -918,27 +727,24 @@ io_stats_log_px_stat (xlator_t *this, ios_sample_buf_t *sample_buf, int i = 0; int px = 0; struct ios_conf *conf = NULL; - + 
collected = sample_buf->collected; - + px = (int)(pval * 100); pn_idx = (int)(pval * collected); pn_val = global_ios_latency[pn_idx]; - ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, - "global", px, pn_val); + ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, "global", px, pn_val); for (i = 0; i < GF_FOP_MAXVALUE; i++) { qsort (ios_latencies[i], num_fop[i], sizeof (double), gf_compare_double); pn_idx = (int)(pval*num_fop[i]); pn_val = ios_latencies[i][pn_idx]; - ios_log (this, logfp, - "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, gf_fop_list[i], px, pn_val); + ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, gf_fop_list[i], px, pn_val); } } - double io_stats_prepare_latency_samples (ios_sample_buf_t *sample_buf, int *num_fop, double **ios_latencies, @@ -970,7 +776,8 @@ io_stats_prepare_latency_samples (ios_sample_buf_t *sample_buf, int *num_fop, double io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf, - FILE* logfp, char * key_prefix, char * str_prefix) + FILE* logfp, char * key_prefix, + char * str_prefix) { double *ios_latencies[GF_FOP_MAXVALUE] = {NULL, }; double *global_ios_latency = NULL; @@ -991,8 +798,6 @@ io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf, } for (i = 0; i < GF_FOP_MAXVALUE; i++) { - qsort (ios_latencies[i], num_fop[i], sizeof (double), - gf_compare_double); ios_latencies[i] = GF_CALLOC (collected, sizeof (double), 0); @@ -1028,7 +833,9 @@ io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf, num_fop, ios_latencies, global_ios_latency); out: + GF_FREE (prefix); + GF_FREE (global_ios_latency); for (j = 0; j < i; j++) { GF_FREE (ios_latencies[j]); @@ -1038,8 +845,7 @@ out: } int -ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this, - FILE *logfp) +ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this, FILE* logfp) { struct ios_stat_list *entry = NULL; @@ -1056,7 +862,7 @@ 
ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this, int ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this, - FILE *logfp, ios_stats_thru_t type) + FILE* logfp, ios_stats_type_t type) { struct ios_stat_list *entry = NULL; struct timeval time = {0, }; @@ -1081,6 +887,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this, int _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { + struct ios_conf *conf = NULL; char *key_root = "storage.gluster"; char *xlator_name = NULL; char *instance_name = NULL; @@ -1088,9 +895,10 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { int bytes_written = 0; int i = 0; int ret = 0; - struct ios_conf *conf = this->private; - xlator_name = strdupa (this->name); + conf = this->private; + xlator_name = strdupa (get_top_xlator_name (this)); + for (i = 0; i < strlen (xlator_name); i++) { if (xlator_name[i] == '/') xlator_name[i] = '_'; @@ -1100,15 +908,14 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { if (conf->iamshd) { xlator_name = "shd"; } else if (conf->iamnfsd) { + instance_name = xlator_name; xlator_name = "nfsd"; - instance_name = strdupa (this->name); } else if (conf->iamgfproxyd) { + instance_name = xlator_name; xlator_name = "gfproxyd"; - instance_name = strdupa (this->name); - } - - if (strcmp (__progname, "glusterfsd") == 0) + } else if (conf->iambrickd) { key_root = "storage.gluster.brick"; + } if (instance_name) { /* +3 for 2 x "." 
+ NULL */ @@ -1221,8 +1028,9 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, ios_log (this, logfp, "\"%s.%s.inodelk_count\": \"%u\",", key_prefix, str_prefix, inodelk_count); - ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc, &xattr, - GLUSTERFS_ENTRYLK_COUNT, NULL, NULL); + ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc, + &xattr, GLUSTERFS_ENTRYLK_COUNT, + NULL, NULL); if (ret == 0 && xattr) { if (dict_get_int32 (xattr, GLUSTERFS_ENTRYLK_COUNT, &entrylk_count) != 0) { @@ -1319,12 +1127,11 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, fop_lat_max = stats->latency[i].max; } } - if (interval == -1) { - ios_log (this, logfp, - "\"%s.%s.fop.%s.count\": \"%"PRId64"\",", - key_prefix, str_prefix, lc_fop_name, - fop_hits); - } else { + ios_log (this, logfp, + "\"%s.%s.fop.%s.count\": \"%"PRId64"\",", + key_prefix, str_prefix, lc_fop_name, + fop_hits); + if (interval != -1) { ios_log (this, logfp, "\"%s.%s.fop.%s.per_sec\": \"%0.2lf\",", key_prefix, str_prefix, lc_fop_name, @@ -1355,6 +1162,7 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, "\"%s.%s.fop.weighted_latency_ave_usec_nozerofill\": \"%0.4lf\",", key_prefix, str_prefix, weighted_fop_ave_usec); } + ios_log (this, logfp, "\"%s.%s.fop.weighted_latency_ave_usec\": \"%0.4lf\",", key_prefix, str_prefix, weighted_fop_ave_usec); @@ -1371,8 +1179,8 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, "\"%s.%s.fop.GOT1\":\"%0.4lf\",", key_prefix, str_prefix, fop_ave_usec); if (conf->dump_percentile_latencies) { - io_stats_dump_pn_latencies (this, sample_buf, logfp, - key_prefix, str_prefix); + io_stats_dump_pn_latencies (this, sample_buf, logfp, key_prefix, + str_prefix); ios_log (this, logfp, "\"%s.%s.fop.GOT2\":\"%0.4lf\",", key_prefix, str_prefix, fop_ave_usec); @@ -1381,29 +1189,32 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, "\"%s.%s.fop.GOT3\":\"%0.4lf\",", key_prefix, str_prefix, fop_ave_usec); - if (conf->iamnfsd) { + if (conf->iamnfsd || conf->iambrickd) { dict_t 
*xattr = NULL; - ret = syncop_getxattr (this, &unused_loc, &xattr, + ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc, &xattr, IO_THREADS_QUEUE_SIZE_KEY, NULL, NULL); if (xattr) { - // Iterate over the dictionary returned to us by io-threads and - // dump the results to the stats file. + /* + * Iterate over the dictionary returned to us by + * io-threads and dump the results to the stats file. + */ data_pair_t *curr = NULL; - dict_for_each (xattr, curr) { + dict_foreach_inline (xattr, curr) { ios_log (this, logfp, - "\"%s.%s.%s.queue_size\": \"%d\",", - key_prefix, str_prefix, curr->key, - data_to_int32 (curr->value)); + "\"%s.%s.%s.queue_size\": \"%d\",", + key_prefix, str_prefix, curr->key, + data_to_int32 (curr->value)); } - // Free the dictionary + /* Free the dictionary! */ dict_unref (xattr); } else { - gf_log (this->name, GF_LOG_WARNING, - "Unable to get queue size counts from " - "the io-threads translator!"); + gf_log (this->name, GF_LOG_DEBUG, + "Unable to get queue size counts " + "from the io-threads translator!"); } } + if (interval == -1) { ios_log (this, logfp, "\"%s.%s.uptime\": \"%"PRId64"\",", key_prefix, str_prefix, @@ -1427,7 +1238,19 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, (double)(stats->data_written / interval_sec)); } +#if OFFSET_WRITE_DETECTOR + if (conf->iamnfsd) { + ios_log (this, logfp, + ",\"%s.%s.offset_writes\": \"%"PRId64"\",", + key_prefix, str_prefix, stats->offset_write_ops); + ios_log (this, logfp, + "\"%s.%s.total_writes\": \"%"PRId64"\"", + key_prefix, str_prefix, stats->total_write_ops); + } +#endif + ios_log (this, logfp, "}"); + ret = 0; out: GF_FREE (key_prefix); @@ -1443,14 +1266,14 @@ _resolve_username (xlator_t *this, uid_t uid) char *pwd_buf = NULL; char *ret = NULL; - /* Prepare our buffer for the uid->username translation */ + // Prepare our buffer for the uid->username translation #ifdef _SC_GETGR_R_SIZE_MAX pwd_buf_len = sysconf (_SC_GETGR_R_SIZE_MAX); #else pwd_buf_len = -1; #endif if 
(pwd_buf_len == -1) { - pwd_buf_len = DEFAULT_PWD_BUF_SZ; /* per the man page */ + pwd_buf_len = DEFAULT_PWD_BUF_SZ; // per the man page } pwd_buf = alloca (pwd_buf_len); @@ -1482,14 +1305,14 @@ _resolve_group_name (xlator_t *this, gid_t gid) char *grp_buf = NULL; char *ret = NULL; - /* Prepare our buffer for the gid->group name translation */ + // Prepare our buffer for the gid->group name translation #ifdef _SC_GETGR_R_SIZE_MAX grp_buf_len = sysconf (_SC_GETGR_R_SIZE_MAX); #else grp_buf_len = -1; #endif if (grp_buf_len == -1) { - grp_buf_len = DEFAULT_GRP_BUF_SZ; /* per the man page */ + grp_buf_len = DEFAULT_GRP_BUF_SZ; // per the man page } grp_buf = alloca (grp_buf_len); @@ -1520,7 +1343,7 @@ err: */ void _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, - FILE *logfp) + FILE* logfp) { double epoch_time = 0.00; char *xlator_name = NULL; @@ -1531,8 +1354,12 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, char *port_pos = NULL; char *group_name = NULL; char *username = NULL; - char *path = NULL; struct ios_conf *conf = NULL; + const char *pri_str = NULL; + const char *ns_name = NULL; + const char *path = NULL; + const char *name = NULL; + loc_t *loc = NULL; const char *error_string = NULL; int32_t op_errno = 0; @@ -1541,7 +1368,7 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, epoch_time = (sample->timestamp).tv_sec + ((sample->timestamp).tv_usec / 1000000.0); - if (strlen (sample->identifier) == 0) { + if (!sample->identifier || (strlen (sample->identifier) == 0)) { hostname = "Unknown"; port = "Unknown"; } else { @@ -1553,16 +1380,12 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, if (!port) goto err; *port_pos = '\0'; - hostname = gf_rev_dns_lookup_cached (identifier, - conf->dnscache); + hostname = gf_rev_dns_lookup (identifier); if (!hostname) - hostname = "Unknown"; + hostname ="Unknown"; } - xlator_name = this->name; - if (!xlator_name || strlen (xlator_name) == 
0) - xlator_name = "Unknown"; - + xlator_name = strdupa (get_top_xlator_name (this)); instance_name = this->instance_name; if (!instance_name || strlen (instance_name) == 0) instance_name = "N/A"; @@ -1581,6 +1404,24 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, sprintf (group_name, "%d", (int32_t)sample->gid); } + /* + * If the priority was unspecified, print out actual priority of the FOP, + * otherwise print the priority given on the frame. + */ + if (sample->fop_pri != GF_FOP_PRI_UNSPEC) { + pri_str = fop_pri_to_string (sample->fop_pri); + } else { + pri_str = fop_enum_to_pri_string (sample->fop_type); + } + + ns_name = get_ns_from_hash (conf, &sample->ns_info); + if (!ns_name) { + ns_name = "Unknown"; + } else if (strcmp (ns_name, "/") == 0) { + /* Give the root a readable name in the samples. */ + ns_name = "Root"; + } + path = "Unknown"; if (sample->have_path) path = sample->path; @@ -1592,11 +1433,11 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, } ios_log (this, logfp, - "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s", - epoch_time, fop_enum_to_pri_string (sample->fop_type), + "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%s,%d,%s", + epoch_time, pri_str, fop_enum_to_string (sample->fop_type), sample->elapsed, xlator_name, instance_name, username, - group_name, hostname, port, path, op_errno, error_string); + group_name, hostname, port, ns_name, path, op_errno, error_string); goto out; err: gf_log (this->name, GF_LOG_ERROR, @@ -1606,6 +1447,426 @@ out: GF_FREE (username); } +/** + * get_ns_from_hash + * + * Given a hash (produced by the namespace hashing xlator), retrieve a string + * that can be used as a key for a few other dictionaries to retrieve crucial + * info for NS throttling. + * + * @conf: Translator configuration struct. + * @info: Namespace hash information. 
+ */ +const char * +get_ns_from_hash (struct ios_conf *conf, const ns_info_t *info) +{ + char ns_key[DICT_UINT32_KEY_SIZE]; + char *ns_name; + int ret; + + /* Handle the special case hashes. */ + if (info->found == _gf_false) { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, "NS > H: NO HASH."); + return NULL; + } + + dict_uint32_to_key (info->hash, ns_key); + ret = dict_get_str (conf->hash_to_ns, ns_key, &ns_name); + + if (ret == 0) { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, + "NS > H: %u -> %s.", + info->hash, ns_name); + return ns_name; + } else { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, + "NS > H: HASH %u NOT FOUND.", + info->hash); + return NULL; + } +} + +/** + * get_fop_hits_for_ns + * + * Given a namespace, returns the ios_fop_hits_t structure tracking + * fop averages for this namespace. + * + * @this: Pointer to the io-stats xlator. + * @ns_name: Namespace we want to find the fop_hits structure for. + */ +ios_fop_hits_t * +get_fop_hits_for_ns (const xlator_t *this, const char *ns_name) +{ + struct ios_conf *conf = this->private; + data_t *fop_hits_dict_entry_data; + ios_fop_hits_t *fop_hits; + + fop_hits_dict_entry_data = dict_get (conf->fop_hits_dict, (char *)ns_name); + + if (fop_hits_dict_entry_data) { + fop_hits = (ios_fop_hits_t *)fop_hits_dict_entry_data->data; + + if (fop_hits) { + return fop_hits; + } + } + return NULL; +} + +/** + * get_fop_rate_for_ns + * + * Returns the currently computed fop rate for the given namespace. + * + * @this: Pointer to the io-stats xlator. + * @ns_name: Namespace we want to find the fop rate for. + */ +double +get_fop_rate_for_ns (const xlator_t *this, const char *ns_name) +{ + ios_fop_hits_t *fop_hits = get_fop_hits_for_ns (this, ns_name); + return fop_hits ? 
fop_hits->avg_fops : -1; +} + +int64_t get_allowed_fop_rate_for_ns (struct ios_conf *conf, const char *ns_name) +{ + int64_t rate; + + if (dict_get_int64 (conf->throttling_rates, + (char *)ns_name, &rate) == 0) { + return rate; + } else { + return -1; + } +} + +gf_boolean_t +should_throttle_fop (call_frame_t *frame, xlator_t *this) +{ + struct ios_conf *conf = this->private; + const char *ns_name = NULL; + double fop_rate = 0; + int64_t allowed_fop_rate = 0; + int ret = 0; + + /* + * If throttling is not enabled, of course, do not throttle. + */ + if (!conf->throttling_enabled) { + goto no_throttle; + } + + /* + * If no throttling rates are defined, default to no throttling. + */ + if (!conf->throttling_rates) { + goto no_throttle; + } + + /* + * Attempt to find the namespace directory name from the id, and if + * cannot find it, default to no throttling. + */ + ns_name = get_ns_from_hash (conf, &frame->root->ns_info); + if (!ns_name) { + goto no_throttle; + } + + /* + * Attempt to find the current fop rate for the directory. + * If for some reason we cannot find it, default to no throttling. + */ + fop_rate = get_fop_rate_for_ns (this, ns_name); + + if (fop_rate < 0) { + goto no_throttle; + } + + allowed_fop_rate = get_allowed_fop_rate_for_ns (conf, ns_name); + + /* + * If we can't find an allowed fop rate for the namespace, or if it + * is set to (-1), then default to no throttling. + */ + if (allowed_fop_rate < 0) { + goto no_throttle; + } + + /* + * Finally, if the share has exceeded the fop rate, + * we should throttle it. + */ + if (fop_rate <= allowed_fop_rate) { + goto no_throttle; + } + + /* If we are here after all of that, finally throttle the namespace. 
*/ + gf_log (this->name, GF_LOG_DEBUG, + "%f > %ld | Throttling FOP to namespace '%s'", + fop_rate, allowed_fop_rate, ns_name); + return _gf_true; + +no_throttle: + return _gf_false; +} + +static inline ios_fop_hits_t *ios_fop_hits_create () +{ + return GF_CALLOC (1, sizeof (ios_fop_hits_t), gf_io_stats_mt_ios_sample_buf); +} + +int +io_stats_process_ios_sample (const xlator_t *this, const ios_sample_t *sample) +{ + struct ios_conf *conf = this->private; + ios_fop_hits_t *fop_hits = NULL; + const char *ns_name = NULL; + int ret = 0; + data_t *fop_hits_dict_entry_data = NULL; + + if (!conf) { + ret = -1; + goto out; + } + + /* Silently skip if ns_info isn't initialized yet. Sometimes samples + * get dumped before the dict can be enabled, and while it's not bad, + * it's quite noisy on the brick logs. */ + if (!conf->measure_namespace_rates || !conf->fop_hits_dict) { + return 0; + } + + ns_name = get_ns_from_hash (conf, &sample->ns_info); + if (!ns_name) { + /* It is still useful to collect statistics on "unknown" samples + * so we know how many FOPs are untagged by the namespace xlator. */ + ns_name = "unknown"; + } + + fop_hits_dict_entry_data = dict_get (conf->fop_hits_dict, + (char *)ns_name); + + if (!fop_hits_dict_entry_data) { + /* + * If no FOP hits are detected for a namespace, lets create + * one and place it in the dictionary + */ + fop_hits = ios_fop_hits_create (); + fop_hits_dict_entry_data = bin_to_data (fop_hits, + sizeof (*fop_hits)); + ret = dict_set (conf->fop_hits_dict, (char *)ns_name, + fop_hits_dict_entry_data); + if (ret != 0) { + goto out; + } + } + + /* + * Let's now take the fop_hits and increment the appropriate counter. 
+ */ + fop_hits = (ios_fop_hits_t *)fop_hits_dict_entry_data->data; + fop_hits->hits[fop_hits->idx]++; + fop_hits->elapsed[fop_hits->idx] += sample->elapsed; + +out: + return ret; +} + +int +io_stats_compute_sma_for_ns (struct ios_dump_args *args, const char *ns_name, + ios_fop_hits_t *fop_hits, dict_t *dict, uint64_t idx) +{ + xlator_t *this = args->this; + struct ios_conf *conf = this->private; + int ret = -1; + int hit_idx, i; + int num_samples = 0; + double avg_fops = 0; + double elapsed = 0; + double avg_latency = 0; + char *key_prefix = NULL; + + GF_VALIDATE_OR_GOTO (this->name, this, out); + GF_VALIDATE_OR_GOTO (this->name, ns_name, out); + GF_VALIDATE_OR_GOTO (this->name, fop_hits, out); + + ret = _io_stats_get_key_prefix (this, &key_prefix); + if (ret) { + goto out; + } + + /* For each i'th window before our current sliding index (e.g. the + * 0th window, 1st window, 2nd window preceding the current index), + * we want to try to sum the counted hits */ + for (i = 0; i < conf->ns_rate_window; i++) { + /* This gets the window at `idx - i` and wraps correctly around + * the FOP_HITS_SIZE-sized array. */ + hit_idx = (fop_hits->idx - i + FOP_HITS_SIZE) % FOP_HITS_SIZE; + gf_log (this->name, GF_LOG_DEBUG, "%s.hits[%d] = %lu * %d", + ns_name, hit_idx, fop_hits->hits[hit_idx], + conf->ios_sample_interval); + gf_log (this->name, GF_LOG_DEBUG, "%s.latency[%d] = %lf", + ns_name, hit_idx, fop_hits->elapsed[hit_idx]); + /* Accumulate the number of samples that we've seen. */ + num_samples += fop_hits->hits[hit_idx]; + /* We add the total elapsed time for all samples in our window, + * which we will divide later by the total number of processed + * samples. */ + elapsed += fop_hits->elapsed[hit_idx]; + } + + /* The number of fops is given by the samples multiplied by the sample + * interval, divided by the total length in time of the window. 
*/ + avg_fops = (double)num_samples * conf->ios_sample_interval / + ((double)conf->ns_rate_window * conf->ios_dump_interval); + gf_log (this->name, GF_LOG_DEBUG, + "Average fop rate for %s = %lf / sec", ns_name, avg_fops); + + /* The average latency is given by the total elapsed time for all of + * the samples divided by the total number of samples. If we have 0 + * samples, just return 0. */ + avg_latency = (num_samples > 0) ? (elapsed / (double)num_samples) : 0; + gf_log (this->name, GF_LOG_DEBUG, + "Average fop latency for %s = %lf usec / fop", ns_name, avg_latency); + + if (strcmp ("/", ns_name) == 0) { + /* Give the root namespace a readable name. */ + ns_name = "root"; + } + + /* Include a ',' unless we're at the end of the dict. */ + ios_log (this, args->u.logfp, + "\"%s.namespaces.%s.fop_rate\": \"%lf\",", + key_prefix, ns_name, avg_fops); + ios_log (this, args->u.logfp, + "\"%s.namespaces.%s.avg_fop_latency\": \"%lf\"%s", + key_prefix, ns_name, avg_latency, (idx == dict->count - 1) ? 
"" : ","); + + fop_hits->avg_fops = avg_fops; + fop_hits->avg_latency = avg_latency; + ret = 0; +out: + if (key_prefix) { + GF_FREE (key_prefix); + } + + return ret; +} + +int +__io_stats_compute_moving_average_foreach (dict_t *this, char *key, + data_t *value, void *data, uint64_t idx) +{ + struct ios_dump_args *args = data; + xlator_t *x_this = args->this; + if (key && value) { + gf_log (x_this->name, GF_LOG_TRACE, "Will compute averages for %s", key); + ios_fop_hits_t *fop_hits = (ios_fop_hits_t *)value->data; + io_stats_compute_sma_for_ns (args, key, fop_hits, this, idx); + } + return 0; +} + +int +ios_compute_moving_average (struct ios_dump_args *args) +{ + xlator_t *this = args->this; + struct ios_conf *conf = this->private; + + if (!conf->measure_namespace_rates) { + return 0; + } + + if (!conf->fop_hits_dict) { + return 0; + } + + if (conf->fop_hits_dict->count > 0) { + ios_log (args->this, args->u.logfp, "{"); + gf_log (this->name, GF_LOG_TRACE, __func__); + dict_foreach_with_idx (conf->fop_hits_dict, __io_stats_compute_moving_average_foreach, args); + ios_log (args->this, args->u.logfp, "}"); + } + + return 0; +} + + +int +io_stats_compute_ema_for_dir (xlator_t *this, const char *dirname, + ios_fop_hits_t *fop_hits) +{ + return 0; +} + +int +__io_stats_bump_ctr_foreach (dict_t *this, char *key, data_t *value, void *data) +{ + xlator_t *x_this = (xlator_t *)data; + ios_fop_hits_t *fop_hits = (ios_fop_hits_t *)value->data; + + /* Increment counter (and wrap), then reset hits for that index. 
*/ + fop_hits->idx = (fop_hits->idx + 1) % FOP_HITS_SIZE; + fop_hits->hits[fop_hits->idx] = 0; + fop_hits->elapsed[fop_hits->idx] = 0; + + gf_log (x_this->name, GF_LOG_TRACE, "Moved counters for %s to %ld", + key, fop_hits->idx); + + return 0; +} + +void +io_stats_bump_ctrs (xlator_t *this) +{ + struct ios_conf *conf = this->private; + + if (!conf->measure_namespace_rates) { + return; + } + + gf_log (this->name, GF_LOG_TRACE, __func__); + dict_foreach (conf->fop_hits_dict, __io_stats_bump_ctr_foreach, + this); +} + +int +__io_stats_destroy_fop_hits_dict_foreach (dict_t *this, char *key, + data_t *value, void *data) +{ + if (key && value) { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Will destroy values for %s", key); + GF_FREE (value->data); + value->data = NULL; + } + return 0; +} + +static inline void +io_stats_destroy_fop_hits_dict (dict_t **fop_hits_dict) +{ + if (!fop_hits_dict) return; + + /** + * Loop through each of the entries in the dict and destroy + * allocated values. + */ + dict_foreach (*fop_hits_dict, __io_stats_destroy_fop_hits_dict_foreach, NULL); + dict_destroy (*fop_hits_dict); + *fop_hits_dict = NULL; +} + +void +io_stats_dump_foprate_stats (struct ios_dump_args *args) +{ + /* Snapshot a moving average for this interval */ + ios_compute_moving_average (args); + + /* Bump some counters */ + io_stats_bump_ctrs (args->this); +} + /* * Takes in our sample buffer and then dumps out the contents */ @@ -1618,36 +1879,39 @@ io_stats_dump_latency_samples_logfp (xlator_t *this, FILE* logfp, /* Empty case, nothing to do, exit. 
*/ if (sample_buf == NULL || sample_buf->collected == 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_TRACE, "No samples, dump not required."); - ret = 0; + ret =0; goto out; } - /* Wrap-around case, dump from pos to sample_buf->size -1 - * and then from 0 to sample_buf->pos (covered off by - * "simple case") - */ - if (sample_buf->collected > sample_buf->pos + 1) { + /* If we've collected more samples than the buffer can hold, then we + * should dump the whole buffer, starting from the oldest entries which + * begin at sample_buf->pos. */ + if (sample_buf->collected > sample_buf->size) { for (i = sample_buf->pos; i < sample_buf->size; i++) { - _io_stats_write_latency_sample (this, - &(sample_buf->ios_samples[i]), logfp); + ios_sample_t *sample = &(sample_buf->ios_samples[i]); + io_stats_process_ios_sample (this, sample); + _io_stats_write_latency_sample (this, sample, logfp); } } - /* Simple case: Dump from 0 to sample_buf->pos */ + /* Simple case: Dump from 0 to sample_buf->pos. These are the newest + * entries if we have wrapped around our buffer, as well. 
*/ for (i = 0; i < sample_buf->pos; i++) { - _io_stats_write_latency_sample (this, - &(sample_buf->ios_samples[i]), logfp); + ios_sample_t *sample = &(sample_buf->ios_samples[i]); + io_stats_process_ios_sample (this, sample); + _io_stats_write_latency_sample (this, sample, logfp); } out: return ret; } + int io_stats_dump_global_to_logfp (xlator_t *this, struct ios_global_stats *stats, - struct timeval *now, int interval, FILE *logfp) + struct timeval *now, int interval, FILE* logfp) { int i = 0; int per_line = 0; @@ -1789,15 +2053,13 @@ io_stats_dump_global_to_logfp (xlator_t *this, struct ios_global_stats *stats, ios_log (this, logfp, "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)" "\tFILE NAME"); list_head = &conf->thru_list[IOS_STATS_THRU_READ]; - ios_dump_throughput_stats(list_head, this, logfp, - IOS_STATS_THRU_READ); + ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_TYPE_READ); ios_log (this, logfp, "\n======Write Throughput File Stats======"); ios_log (this, logfp, "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)" "\tFILE NAME"); list_head = &conf->thru_list[IOS_STATS_THRU_WRITE]; - ios_dump_throughput_stats (list_head, this, logfp, - IOS_STATS_THRU_WRITE); + ios_dump_throughput_stats (list_head, this, logfp, IOS_STATS_TYPE_WRITE); } return 0; } @@ -2001,58 +2263,6 @@ ios_global_stats_clear (struct ios_global_stats *stats, struct timeval *now) stats->started_at = *now; } -static int io_stats_dump_quorum (xlator_t *this, struct ios_dump_args *args) { - FILE *logf = args->u.logfp; - loc_t root_loc = {0}; - dict_t *dict = NULL; - xlator_list_t *child = NULL; - const char *leading_comma = ""; - - if (args->type != IOS_DUMP_TYPE_JSON_FILE) { - return -EINVAL; - } - - if (!this->itable->root) { - return -ENOENT; - } - - // If we don't build a valid 'loc', dht_getxattr swallows our request - // instead of passing it down to AFR. 
- root_loc.path = "/"; - root_loc.name = ""; - root_loc.inode = inode_ref (this->itable->root); - gf_uuid_copy (root_loc.gfid, root_loc.inode->gfid); - - ios_log (this, logf, "{"); - - for (child = this->children; child; child = child->next) { - dict = NULL; - - syncop_getxattr (child->xlator, &root_loc, &dict, - GF_AFR_QUORUM_CHECK, NULL, NULL); - - if (dict) { - const data_pair_t *e; - - dict_for_each (dict, e) { - ios_log (this, logf, - "%s\"storage.gluster.nfsd.%s\": \"%d\"", - leading_comma, - e->key, data_to_int32 (e->value)); - leading_comma = ","; - } - - dict_unref (dict); - } - } - - ios_log (this, logf, "}"); - - inode_unref (root_loc.inode); - - return 0; -} - int io_stats_dump (xlator_t *this, ios_sample_buf_t *sample_buf, struct ios_dump_args *args, gf1_cli_info_op op, @@ -2093,19 +2303,15 @@ io_stats_dump (xlator_t *this, ios_sample_buf_t *sample_buf, } UNLOCK (&conf->lock); - if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_CUMULATIVE) { + if (op == GF_CLI_INFO_ALL || + op == GF_CLI_INFO_CUMULATIVE) io_stats_dump_global (this, &cumulative, sample_buf, &now, -1, args); - } - if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_INCREMENTAL) { + if (op == GF_CLI_INFO_ALL || + op == GF_CLI_INFO_INCREMENTAL) io_stats_dump_global (this, &incremental, sample_buf, &now, increment, args); - } - - if (conf->iamnfsd) { - io_stats_dump_quorum (this, args); - } return 0; } @@ -2176,6 +2382,50 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd) return 0; } +gf_boolean_t _should_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, ios_sample_buf_t* ios_sample_buf, + int32_t op_ret, int32_t op_errno) +{ + int i; + + /* If sampling is disabled, return false */ + if (conf->ios_sample_interval == 0) + return _gf_false; + + /* Sometimes it's useful to sample errors. If `fop-sample-all-errors` + * is active, then we should sample ALL errors. 
*/ + if (op_ret < 0 && op_errno != 0 && conf->sample_all_errors) { + return _gf_true; + } + + /* If `fop-sample-hard-errors` is active, we only look through a small + * subset of errno values to sample, those which are critical to Gluster + * functioning. */ + if (op_ret < 0 && op_errno != 0 && conf->sample_hard_errors) { + for (i = 0; i < IOS_HARD_ERROR_LIST_SIZE; i++) { + if (abs (op_errno) == ios_hard_error_list[i]) { + return _gf_true; + } + } + } + + /* If auditing is on, sample TRUNCATE, CREATE, UNLINK, RMDIR, MKDIR 1:1 */ + if (conf->audit_creates_and_unlinks) { + switch (fop_type) { + case GF_FOP_TRUNCATE: + case GF_FOP_CREATE: + case GF_FOP_UNLINK: + case GF_FOP_MKDIR: + case GF_FOP_RMDIR: + return _gf_true; + default: + break; + } + } + + /* Sample only 1 out of ios_sample_interval number of fops. */ + return (ios_sample_buf->observed % conf->ios_sample_interval == 0); +} + void ios_local_get_inode (struct ios_local *local, inode_t **inode) { if (!local) @@ -2215,8 +2465,7 @@ void ios_local_get_path (call_frame_t *frame, const char **path) */ ios_inode_ctx_get (inode, frame->this, &iosstat); if (iosstat) { - gf_log ("io-stats", GF_LOG_DEBUG, - "[%s] Getting path from iostat struct", + gf_log ("io-stats", GF_LOG_DEBUG, "[%s] Getting path from iostat struct", fop_enum_to_string (frame->op)); *path = iosstat->filename; goto out; @@ -2228,8 +2477,7 @@ void ios_local_get_path (call_frame_t *frame, const char **path) * inside the local. */ if (local->loc.path) { - gf_log ("io-stats", GF_LOG_DEBUG, - "[%s] Getting path from loc_t", + gf_log ("io-stats", GF_LOG_DEBUG, "[%s] Getting path from loc_t", fop_enum_to_string (frame->op)); *path = local->loc.path; goto out; @@ -2239,60 +2487,13 @@ out: /* If the inode and the loc don't have the path, we're out of luck. 
*/ if (!*path) { - gf_log ("io-stats", GF_LOG_DEBUG, - "Unable to get path for fop: %s", + gf_log ("io-stats", GF_LOG_DEBUG, "Unable to get path for fop: %s", fop_enum_to_string (frame->op)); } return; } -gf_boolean_t -_should_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, - ios_sample_buf_t* ios_sample_buf, int32_t op_ret, - int32_t op_errno) -{ - int i; - - /* If sampling is disabled, return false */ - if (conf->ios_sample_interval == 0) - return _gf_false; - - /* Sometimes it's useful to sample errors. If `fop-sample-all-errors` - * is active, then we should sample ALL errors. */ - if (op_ret < 0 && op_errno != 0 && conf->sample_all_errors) { - return _gf_true; - } - - /* If `fop-sample-hard-errors` is active, we only look through a small - * subset of errno values to sample, those which are critical to Gluster - * functioning. */ - if (op_ret < 0 && op_errno != 0 && conf->sample_hard_errors) { - for (i = 0; i < IOS_HARD_ERROR_LIST_SIZE; i++) { - if (abs (op_errno) == ios_hard_error_list[i]) { - return _gf_true; - } - } - } - - /* If auditing is on, sample TRUNCATE, CREATE, UNLINK, RMDIR, MKDIR 1:1 */ - if (conf->audit_creates_and_unlinks) { - switch (fop_type) { - case GF_FOP_TRUNCATE: - case GF_FOP_CREATE: - case GF_FOP_UNLINK: - case GF_FOP_MKDIR: - case GF_FOP_RMDIR: - return _gf_true; - default: - break; - } - } - - /* Sample only 1 out of ios_sample_interval number of fops. 
*/ - return (ios_sample_buf->observed % conf->ios_sample_interval == 0); -} - void collect_ios_latency_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, double elapsed, call_frame_t *frame, int32_t op_ret, int32_t op_errno) @@ -2304,12 +2505,11 @@ void collect_ios_latency_sample (struct ios_conf *conf, call_stack_t *root = NULL; const char *path = NULL; - ios_sample_buf = conf->ios_sample_buf; LOCK (&conf->ios_sampling_lock); - if (!_should_sample (conf, fop_type, ios_sample_buf, op_ret, op_errno)) { + + if (!_should_sample (conf, fop_type, ios_sample_buf, op_ret, op_errno)) goto out; - } timestamp = &frame->begin; root = frame->root; @@ -2317,54 +2517,44 @@ void collect_ios_latency_sample (struct ios_conf *conf, ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]); ios_sample->elapsed = elapsed; ios_sample->fop_type = fop_type; - ios_sample->op_ret = op_ret; - ios_sample->op_errno = op_errno; - ios_sample->uid = root->uid; - ios_sample->gid = root->gid; - (ios_sample->timestamp).tv_sec = timestamp->tv_sec; - (ios_sample->timestamp).tv_usec = timestamp->tv_usec; - memcpy (&ios_sample->identifier, &root->identifier, - sizeof (root->identifier)); - /* Eventually every FOP will be supported - * (i.e., the frame->local will be + /* Eventually every FOP will be supported (i.e., the frame->local will be * of type struct ios_local), but for now, this is a safety. 
*/ switch (ios_sample->fop_type) { - - case GF_FOP_CREATE: - case GF_FOP_OPEN: - case GF_FOP_STAT: - case GF_FOP_FSTAT: - case GF_FOP_READ: - case GF_FOP_WRITE: - case GF_FOP_OPENDIR: - case GF_FOP_READDIRP: - case GF_FOP_READDIR: - case GF_FOP_FLUSH: - case GF_FOP_ACCESS: - case GF_FOP_UNLINK: - case GF_FOP_TRUNCATE: - case GF_FOP_MKDIR: - case GF_FOP_RMDIR: - case GF_FOP_SETATTR: - case GF_FOP_LOOKUP: - case GF_FOP_INODELK: - case GF_FOP_FINODELK: - case GF_FOP_ENTRYLK: - case GF_FOP_FXATTROP: - case GF_FOP_XATTROP: - case GF_FOP_GETXATTR: - case GF_FOP_FGETXATTR: - case GF_FOP_SETXATTR: - case GF_FOP_FSETXATTR: - case GF_FOP_STATFS: - case GF_FOP_FSYNC: - ios_local_get_path (frame, &path); - break; - default: - path = NULL; - break; + case GF_FOP_CREATE: + case GF_FOP_OPEN: + case GF_FOP_STAT: + case GF_FOP_FSTAT: + case GF_FOP_READ: + case GF_FOP_WRITE: + case GF_FOP_OPENDIR: + case GF_FOP_READDIRP: + case GF_FOP_READDIR: + case GF_FOP_FLUSH: + case GF_FOP_ACCESS: + case GF_FOP_UNLINK: + case GF_FOP_TRUNCATE: + case GF_FOP_MKDIR: + case GF_FOP_RMDIR: + case GF_FOP_SETATTR: + case GF_FOP_LOOKUP: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FXATTROP: + case GF_FOP_XATTROP: + case GF_FOP_GETXATTR: + case GF_FOP_FGETXATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_STATFS: + case GF_FOP_FSYNC: + ios_local_get_path (frame, &path); + break; + default: + path = NULL; + break; } if (path) { @@ -2372,12 +2562,19 @@ void collect_ios_latency_sample (struct ios_conf *conf, ios_sample->have_path = _gf_true; } - /* We've reached the end of the circular buffer, start from the - * beginning. 
*/ - if (ios_sample_buf->pos == (ios_sample_buf->size - 1)) - ios_sample_buf->pos = 0; - else - ios_sample_buf->pos++; + ios_sample->op_ret = op_ret; + ios_sample->op_errno = op_errno; + ios_sample->fop_pri = frame->pri; + ios_sample->uid = root->uid; + ios_sample->gid = root->gid; + ios_sample->ns_info = frame->root->ns_info; + (ios_sample->timestamp).tv_sec = timestamp->tv_sec; + (ios_sample->timestamp).tv_usec = timestamp->tv_usec; + memcpy (&ios_sample->identifier, &root->identifier, sizeof (root->identifier)); + + /* Increment, making sure that if we've reached the end of the circular + * buffer, then start from the beginning (modulo). */ + ios_sample_buf->pos = (ios_sample_buf->pos + 1) % ios_sample_buf->size; ios_sample_buf->collected++; out: ios_sample_buf->observed++; @@ -2547,8 +2744,7 @@ unlock_list_head: } static int -attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, - const uuid_t gfid) { +attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, const uuid_t gfid) { struct ios_stat *iosstat = NULL; if (!inode) { @@ -2557,21 +2753,22 @@ attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, ios_inode_ctx_get (inode, this, &iosstat); if (!iosstat) { - iosstat = GF_CALLOC (1, sizeof (*iosstat), - gf_io_stats_mt_ios_stat); + iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); if (!iosstat) { return -ENOMEM; } iosstat->filename = gf_strdup (path); gf_uuid_copy (iosstat->gfid, gfid); LOCK_INIT (&iosstat->lock); +#if OFFSET_WRITE_DETECTOR + iosstat->expected_write_offset = UNKNOWN_WRITE_OFFSET; +#endif ios_inode_ctx_set (inode, this, iosstat); } return 0; } - int ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd) { @@ -2613,7 +2810,6 @@ free_and_out: return ret; } - int io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, @@ -2650,12 +2846,12 @@ io_stats_create_cbk (call_frame_t *frame, 
void *cookie, xlator_t *this, } UNLOCK (&conf->lock); - attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, - buf->ia_gfid); - + attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, buf->ia_gfid); unwind: UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, preparent, postparent, xdata); return 0; @@ -2681,6 +2877,7 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } conf = this->private; + ios_build_fd (this, local->loc.path, fd, &iosfd); if (!iosfd) { goto unwind; @@ -2701,16 +2898,15 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN); } - attach_iosstat_to_inode (this, local->loc.inode, - local->loc.path, - local->loc.inode->gfid); - + attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, local->loc.inode->gfid); unwind: UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } @@ -2719,7 +2915,9 @@ io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2750,7 +2948,6 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (iosstat) { BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); - } unwind: @@ -2778,39 +2975,34 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno); ios_inode_ctx_get (local->inode, this, &iosstat); - if (iosstat) { BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); } + unwind: 
ios_free_local (frame); STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; - } - - - int io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata) { + struct ios_local *local = NULL; struct ios_stat *iosstat = NULL; - inode_t *inode = frame->local; - frame->local = NULL; + local = frame->local; UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno); - ios_inode_ctx_get (inode, this, &iosstat); - + ios_inode_ctx_get (local->inode, this, &iosstat); if (iosstat) { - BUMP_STATS (iosstat, IOS_STATS_TYPE_READDIRP); - iosstat = NULL; + BUMP_STATS (iosstat, IOS_STATS_TYPE_READDIRP); } + ios_free_local (frame); STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2829,7 +3021,6 @@ io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ios_free_local (frame); - UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2842,8 +3033,7 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { UPDATE_PROFILE_STATS (frame, FSYNC, op_ret, op_errno); ios_free_local (frame); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -2894,7 +3084,6 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *sbuf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata); return 0; } @@ -2907,11 +3096,10 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_t *xdata, struct iatt *postparent) { struct ios_local *local = frame->local; - if (local && local->loc.path && inode && op_ret >= 0) { - attach_iosstat_to_inode (this, inode, 
local->loc.path, - inode->gfid); + attach_iosstat_to_inode (this, inode, local->loc.path, inode->gfid); } + UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno); ios_free_local (frame); STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata, @@ -2956,9 +3144,11 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct ios_local *local = frame->local; if (local && local->loc.path) { - local->inode = inode_ref (inode); - attach_iosstat_to_inode (this, inode, local->loc.path, - buf->ia_gfid); + local->inode = inode_ref (inode); /* Just for consistency's + * sake. + */ + + attach_iosstat_to_inode (this, inode, local->loc.path, buf->ia_gfid); } UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno); @@ -3008,14 +3198,12 @@ io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret < 0) goto unwind; - attach_iosstat_to_inode (this, local->inode, local->loc.path, - local->inode->gfid); + attach_iosstat_to_inode (this, local->inode, local->loc.path, local->inode->gfid); ios_fd_ctx_set (local->fd, this, 0); ios_inode_ctx_get (local->fd->inode, this, &iosstat); if (iosstat) BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR); - unwind: UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno); ios_free_local (frame); @@ -3029,7 +3217,6 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno); ios_free_local (frame); STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, @@ -3121,7 +3308,6 @@ io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); return 0; } @@ -3132,7 +3318,6 @@ io_stats_fsyncdir_cbk (call_frame_t *frame, 
void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); return 0; } @@ -3148,15 +3333,14 @@ io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * in NFS. We need to make sure that we are attaching the * data correctly to this inode. */ - if (local->loc.inode && local->loc.path) { - attach_iosstat_to_inode (this, local->loc.inode, - local->loc.path, - local->loc.inode->gfid); + if (local && local->loc.inode && local->loc.path) { + attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, local->loc.inode->gfid); } UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno); ios_free_local (frame); STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); + return 0; } @@ -3167,7 +3351,6 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -3191,8 +3374,7 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno); - ios_free_local (frame); - STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -3204,8 +3386,7 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, DISCARD, op_ret, op_errno); - ios_free_local (frame); - STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -3216,7 
+3397,6 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -3227,7 +3407,6 @@ io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno); - ios_free_local (frame); STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata); return 0; } @@ -3285,6 +3464,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this, START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_entrylk_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->entrylk, @@ -3301,6 +3482,8 @@ io_stats_inodelk (call_frame_t *frame, xlator_t *this, START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_inodelk_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->inodelk, @@ -3325,8 +3508,11 @@ io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_finodelk_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->finodelk, @@ -3340,8 +3526,11 @@ io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_xattrop_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, @@ -3355,8 +3544,11 @@ io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + 
STACK_WIND (frame, io_stats_fxattrop_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop, @@ -3370,8 +3562,11 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_lookup_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, @@ -3384,8 +3579,11 @@ int io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_stat_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat, @@ -3398,7 +3596,6 @@ int io_stats_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { - ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_readlink_cbk, @@ -3413,9 +3610,10 @@ int io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { - ios_track_loc (frame, loc); START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, @@ -3429,8 +3627,11 @@ io_stats_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_mkdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, @@ -3444,9 +3645,12 @@ io_stats_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); - STACK_WIND (frame, io_stats_unlink_cbk, + FOP_THROTTLE (frame, this); + + STACK_WIND_COOKIE (frame, io_stats_unlink_cbk, loc, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); @@ -3459,9 +3663,12 @@ io_stats_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int 
flags, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); - STACK_WIND (frame, io_stats_rmdir_cbk, + FOP_THROTTLE (frame, this); + + STACK_WIND_COOKIE (frame, io_stats_rmdir_cbk, loc, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata); @@ -3475,6 +3682,8 @@ io_stats_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, { START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_symlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, @@ -3489,6 +3698,8 @@ io_stats_rename (call_frame_t *frame, xlator_t *this, { START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_rename_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, @@ -3503,6 +3714,8 @@ io_stats_link (call_frame_t *frame, xlator_t *this, { START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_link_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, @@ -3516,8 +3729,11 @@ io_stats_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_setattr_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr, @@ -3531,8 +3747,11 @@ io_stats_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_truncate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, @@ -3550,6 +3769,8 @@ io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc, START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_open_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, @@ -3562,13 +3783,14 @@ int io_stats_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, 
dict_t *xdata) - { ios_track_loc (frame, loc); ios_track_fd (frame, fd); START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, @@ -3582,8 +3804,11 @@ io_stats_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_readv_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, @@ -3607,13 +3832,36 @@ io_stats_writev (call_frame_t *frame, xlator_t *this, len = iov_length (vector, count); +#if OFFSET_WRITE_DETECTOR + if (conf->iamnfsd && fd->inode) { + struct ios_stat *iosstat = NULL; + + ios_inode_ctx_get (fd->inode, this, &iosstat); + if (iosstat) { + int is_offset = iosstat->expected_write_offset != UNKNOWN_WRITE_OFFSET && + iosstat->expected_write_offset != offset; + + if (is_offset) { + ++conf->cumulative.offset_write_ops; + ++conf->incremental.offset_write_ops; + } + ++conf->cumulative.total_write_ops; + ++conf->incremental.total_write_ops; + iosstat->expected_write_offset = offset + len; + } + } +#endif + BUMP_WRITE (fd, len); START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_writev_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, flags, iobref, xdata); + return 0; } @@ -3624,6 +3872,7 @@ io_stats_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_statfs_cbk, @@ -3639,8 +3888,11 @@ io_stats_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, @@ -3654,8 +3906,11 @@ io_stats_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, 
int32_t flags, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_fsync_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, @@ -3686,37 +3941,44 @@ conditional_dump (dict_t *dict, char *key, data_t *value, void *data) memset (filename, 0, value->len + 1); memcpy (filename, data_to_str (value), value->len); - pid = getpid (); + if (fnmatch ("*io*stat*dump", key, 0) == 0) { + pid = getpid (); - if (!strncmp (filename, "", 1)) { - gf_log (this->name, GF_LOG_ERROR, "No filename given"); - return -1; - } - logfp = fopen (filename, "w+"); - if (!logfp) { - gf_log (this->name, GF_LOG_ERROR, "failed to open %s " + + if (!strncmp (filename, "", 1)) { + gf_log (this->name, GF_LOG_ERROR, "No filename given"); + return -1; + } + logfp = fopen (filename, "w+"); + if (!logfp) { + gf_log (this->name, GF_LOG_ERROR, "failed to open %s " "for writing", filename); - return -1; - } - sprintf (dump_key, "*io*stat*%d_json_dump", pid); - if (fnmatch (dump_key, key, 0) == 0) { - (void) ios_dump_args_init ( - &args, IOS_DUMP_TYPE_JSON_FILE, - logfp); - } else { - (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE, - logfp); + return -1; + } + sprintf (dump_key, "*io*stat*%d_json_dump", pid); + if (fnmatch (dump_key, key, 0) == 0) { + (void) ios_dump_args_init ( + &args, IOS_DUMP_TYPE_JSON_FILE, + logfp); + } else { + (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE, + logfp); + } + + io_stats_dump (this, NULL, &args, GF_CLI_INFO_ALL, + _gf_false); + + fclose (logfp); } - io_stats_dump (this, NULL, &args, GF_CLI_INFO_ALL, _gf_false); - fclose (logfp); + return 0; } int _ios_destroy_dump_thread (struct ios_conf *conf) { conf->dump_thread_should_die = _gf_true; - if (conf->ios_dump_interval > 0) { - (void) pthread_cancel (conf->dump_thread); + + if (conf->dump_thread) { (void) pthread_join (conf->dump_thread, NULL); } return 0; @@ -3768,60 +4030,55 @@ _ios_dump_thread (xlator_t *this) { struct ios_conf 
*conf = NULL; FILE *stats_logfp = NULL; FILE *samples_logfp = NULL; + FILE *foprate_logfp = NULL; struct ios_dump_args args = {0}; ios_sample_buf_t *sample_buf = NULL; int i; int stats_bytes_written = 0; int samples_bytes_written = 0; + int foprate_bytes_written = 0; char stats_filename[PATH_MAX]; char samples_filename[PATH_MAX]; - char *xlator_name; - char *instance_name; - gf_boolean_t log_stats_fopen_failure = _gf_true; - gf_boolean_t log_samples_fopen_failure = _gf_true; - int old_cancel_type; - int ret; + char foprate_filename[PATH_MAX]; + char *xlator_name = NULL; + char *instance_name = NULL; + time_t namespace_conf_mtime; + int ret = 0; conf = this->private; + xlator_name = strdupa (get_top_xlator_name (this)); + gf_log (this->name, GF_LOG_INFO, "IO stats dump thread started, " "polling IO stats every %d seconds", conf->ios_dump_interval); - xlator_name = strdupa (this->name); + for (i = 0; i < strlen (xlator_name); i++) { if (xlator_name[i] == '/') xlator_name[i] = '_'; } + instance_name = this->instance_name; if (conf->iamshd) { xlator_name = "shd"; } else if (conf->iamnfsd) { + instance_name = xlator_name; xlator_name = "nfsd"; - instance_name = strdupa (this->name); - } else if (conf->iamgfproxyd == _gf_true) { + } else if (conf->iamgfproxyd) { + instance_name = xlator_name; xlator_name = "gfproxyd"; - instance_name = strdupa (this->name); - } - if (sys_mkdir (_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) { - if (errno != EEXIST) { - gf_log (this->name, GF_LOG_ERROR, - "could not create stats-dump directory %s", - _IOS_DUMP_DIR); - goto out; - } - } - if (sys_mkdir (_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) { - if (errno != EEXIST) { - gf_log (this->name, GF_LOG_ERROR, - "could not create stats-sample directory %s", - _IOS_SAMP_DIR); - goto out; - } } + + mkdir (_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG); + mkdir (_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG); + if (instance_name) { stats_bytes_written = snprintf (stats_filename, 
PATH_MAX, "%s/%s_%s_%s.dump", _IOS_DUMP_DIR, __progname, xlator_name, instance_name); - samples_bytes_written = snprintf (samples_filename, PATH_MAX, - "%s/%s_%s_%s.samp", _IOS_SAMP_DIR, + samples_bytes_written = snprintf (samples_filename, + PATH_MAX, "%s/%s_%s_%s.samp", _IOS_SAMP_DIR, + __progname, xlator_name, instance_name); + foprate_bytes_written = snprintf (foprate_filename, + PATH_MAX, "%s/%s_%s_%s_namespace_stats.dump", _IOS_DUMP_DIR, __progname, xlator_name, instance_name); } else { stats_bytes_written = snprintf (stats_filename, PATH_MAX, @@ -3830,22 +4087,40 @@ _ios_dump_thread (xlator_t *this) { samples_bytes_written = snprintf (samples_filename, PATH_MAX, "%s/%s_%s.samp", _IOS_SAMP_DIR, __progname, xlator_name); + foprate_bytes_written = snprintf (foprate_filename, PATH_MAX, + "%s/%s_%s_namespace_stats.dump", _IOS_DUMP_DIR, __progname, + xlator_name); } - if ((stats_bytes_written >= PATH_MAX) || - (samples_bytes_written >= PATH_MAX)) { + if (stats_bytes_written >= PATH_MAX || + samples_bytes_written >= PATH_MAX) { gf_log (this->name, GF_LOG_ERROR, "Invalid path for stats dump (%s) and/or latency " "samples (%s)", stats_filename, samples_filename); goto out; } - while (1) { - if (conf->dump_thread_should_die) + + gf_log (this->name, GF_LOG_INFO, + "Writing to files: stats - %s, samples - %s, foprate - %s", + stats_filename, samples_filename, foprate_filename); + + while (_gf_true) { + if (conf->dump_thread_should_die) { break; - (void) pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, - &old_cancel_type); + } + sleep (conf->ios_dump_interval); - (void) pthread_setcanceltype (PTHREAD_CANCEL_DEFERRED, - &old_cancel_type); + + /* First try and load the throttling conf file */ + if (conf->measure_namespace_rates) { + ret = ios_conf_load_namespace_conf (conf); + + if (ret != 0) { + gf_log (GF_IO_STATS, GF_LOG_WARNING, + "Failed to re-load & parse \"%s\", error=%s.", + _IOS_NAMESPACE_CONF, strerror (ret)); + } + } + /* Swap buffers */ ret = 
_ios_swap_sample_buf (this, &sample_buf); if (ret != 0) { @@ -3854,41 +4129,45 @@ _ios_dump_thread (xlator_t *this) { goto out; } - /* - * It's not clear whether we should reopen this each time, or - * just hold it open and rewind/truncate on each iteration. - * Leaving it alone for now. - */ stats_logfp = fopen (stats_filename, "w+"); + + /* Dump statistics */ if (stats_logfp) { - (void) ios_dump_args_init (&args, - IOS_DUMP_TYPE_JSON_FILE, - stats_logfp); - io_stats_dump (this, sample_buf, &args, - GF_CLI_INFO_ALL, _gf_false); + (void) ios_dump_args_init ( + &args, IOS_DUMP_TYPE_JSON_FILE, + stats_logfp); + io_stats_dump (this, sample_buf, &args, GF_CLI_INFO_ALL, _gf_false); fclose (stats_logfp); - log_stats_fopen_failure = _gf_true; - } else if (log_stats_fopen_failure) { - gf_log (this->name, GF_LOG_ERROR, - "could not open stats-dump file %s (%s)", - stats_filename, strerror(errno)); - log_stats_fopen_failure = _gf_false; + } else { + gf_log (this->name, GF_LOG_ERROR, "%s", + strerror (errno)); } + + /* Dump latency samples */ samples_logfp = fopen (samples_filename, "a"); if (samples_logfp) { io_stats_dump_latency_samples_logfp (this, - samples_logfp, - sample_buf); + samples_logfp, sample_buf); fclose (samples_logfp); - log_samples_fopen_failure = _gf_true; - } else if (log_samples_fopen_failure) { - gf_log (this->name, GF_LOG_ERROR, - "could not open samples-dump file %s (%s)", - samples_filename, strerror(errno)); - log_samples_fopen_failure = _gf_false; + } else { + gf_log (this->name, GF_LOG_ERROR, "%s", + strerror (errno)); } - /* cleanup sample buf */ + /* Log throttling stats only if we were asked to measure namespace rates */ + if (conf->measure_namespace_rates) { + foprate_logfp = fopen (foprate_filename, "w+"); + if (foprate_logfp) { + (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_JSON_FILE, foprate_logfp); + args.this = this; + io_stats_dump_foprate_stats (&args); + fclose (foprate_logfp); + } else { + gf_log (this->name, GF_LOG_ERROR, "%s", 
strerror (errno)); + } + } + + // cleanup sample buf ios_destroy_sample_buf (sample_buf); } out: @@ -3896,23 +4175,11 @@ out: return NULL; } -static gf_boolean_t -match_special_xattr (dict_t *d, char *k, data_t *val, void *mdata) -{ - gf_boolean_t ret = _gf_false; - if (fnmatch ("*io*stat*dump", k, 0) == 0) { - ret = _gf_true; - } - - return ret; -} - int io_stats_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - int ret = 0; struct { xlator_t *this; inode_t *inode; @@ -3923,13 +4190,7 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this, stub.inode = loc->inode; stub.path = loc->path; - ret = dict_foreach_match (dict, match_special_xattr, NULL, - conditional_dump, &stub); - if (ret > 0) { - /* Setxattr was on key 'io-stat-dump', hence dump and unwind - * from here */ - goto out; - } + dict_foreach (dict, conditional_dump, &stub); ios_track_loc (frame, loc); @@ -3940,10 +4201,6 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this, FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata); return 0; - -out: - STACK_UNWIND_STRICT (setxattr, frame, 0, 0, NULL); - return 0; } @@ -3952,6 +4209,7 @@ io_stats_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_getxattr_cbk, @@ -3967,6 +4225,7 @@ io_stats_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_removexattr_cbk, @@ -3983,6 +4242,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this, int32_t flags, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fsetxattr_cbk, @@ -3998,6 +4258,7 @@ io_stats_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); 
STACK_WIND (frame, io_stats_fgetxattr_cbk, @@ -4012,7 +4273,6 @@ int io_stats_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fremovexattr_cbk, @@ -4027,9 +4287,13 @@ int io_stats_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) { + ios_track_loc (frame, loc); + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_opendir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, @@ -4041,9 +4305,12 @@ int io_stats_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *dict) { - frame->local = fd->inode; + ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_readdirp_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, @@ -4056,8 +4323,12 @@ int io_stats_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { + ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_readdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, @@ -4072,6 +4343,8 @@ io_stats_fsyncdir (call_frame_t *frame, xlator_t *this, { START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_fsyncdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir, @@ -4085,8 +4358,11 @@ io_stats_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, dict_t *xdata) { ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_access_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->access, @@ -4101,6 +4377,8 @@ io_stats_ftruncate (call_frame_t *frame, xlator_t *this, { START_FOP_LATENCY (frame); + FOP_THROTTLE (frame, this); + STACK_WIND (frame, io_stats_ftruncate_cbk, 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, @@ -4128,6 +4406,7 @@ io_stats_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { ios_track_fd (frame, fd); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fstat_cbk, @@ -4144,6 +4423,8 @@ io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, { START_FOP_LATENCY(frame); + FOP_THROTTLE (frame, this); + STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, xdata); @@ -4411,13 +4692,73 @@ io_priv (xlator_t *this) } int +init_namespace_conf (struct ios_conf *conf) +{ + int ret; + + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Initializing structures for namespace stats tracking."); + + /* Load namespace conf for the first time. */ + ret = ios_conf_load_namespace_conf (conf); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + return ret; +} + +void +init_namespace_options (struct ios_conf *conf) +{ + if (!conf->iambrickd) { + conf->throttling_enabled = _gf_false; + conf->measure_namespace_rates = _gf_false; + gf_log (GF_IO_STATS, GF_LOG_DEBUG, + "Disabling throttling because I'm not a brick."); + return; + } + + /* Can't throttle anything without measuring namespace rates */ + /* TODO: is there a way to do this same enforcement with the namespace + * collection option? 
*/ + if (conf->throttling_enabled) { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Throttling enabled."); + conf->measure_namespace_rates = _gf_true; + } + + if (conf->measure_namespace_rates) { + if (init_namespace_conf (conf) != 0) { + gf_log (GF_IO_STATS, GF_LOG_WARNING, + "Disabling namespace measurements even though " + "it was marked as enabled, due to failures " + "during initialization!"); + conf->throttling_enabled = _gf_false; + conf->measure_namespace_rates = _gf_false; + } else { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Namespace rates enabled."); + } + + if (conf->audit_creates_and_unlinks || conf->sample_all_errors || + conf->sample_hard_errors) { + gf_log (GF_IO_STATS, GF_LOG_WARNING, + "Fop sample auditing or 1:1 error sampling is " + "enabled, which means that namespace rates might " + "be reported incorrectly in some cases!"); + } + } +} + +int reconfigure (xlator_t *this, dict_t *options) { struct ios_conf *conf = NULL; int ret = -1; char *sys_log_str = NULL; char *log_format_str = NULL; - char *logger_str = NULL; + char *logger_str = NULL; int sys_log_level = -1; char *log_str = NULL; int log_level = -1; @@ -4425,7 +4766,6 @@ reconfigure (xlator_t *this, dict_t *options) int logger = -1; uint32_t log_buf_size = 0; uint32_t log_flush_timeout = 0; - int32_t old_dump_interval; if (!this || !this->private) goto out; @@ -4441,13 +4781,11 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("latency-measurement", conf->measure_latency, options, bool, out); - old_dump_interval = conf->ios_dump_interval; GF_OPTION_RECONF ("ios-dump-interval", conf->ios_dump_interval, options, int32, out); - if ((old_dump_interval <= 0) && (conf->ios_dump_interval > 0)) { - pthread_create (&conf->dump_thread, NULL, - (void *) &_ios_dump_thread, this); - } + + GF_OPTION_RECONF ("ns-rate-window", conf->ns_rate_window, options, + int32, out); GF_OPTION_RECONF ("ios-sample-interval", conf->ios_sample_interval, options, int32, out); @@ -4484,22 +4822,25 @@ reconfigure 
(xlator_t *this, dict_t *options) time, out); gf_log_set_log_flush_timeout (log_flush_timeout); - GF_OPTION_RECONF ("fop-sample-enable-audit", - conf->audit_creates_and_unlinks, options, bool, out); + GF_OPTION_RECONF ("fop-sample-enable-audit", conf->audit_creates_and_unlinks, options, bool, out); + + GF_OPTION_RECONF ("fop-sample-all-errors", conf->sample_all_errors, options, bool, out); + + GF_OPTION_RECONF ("fop-sample-hard-errors", conf->sample_hard_errors, options, bool, out); + + GF_OPTION_RECONF ("dump-percentile-latencies", conf->dump_percentile_latencies, options, bool, out); - GF_OPTION_RECONF ("fop-sample-all-errors", - conf->sample_all_errors, options, bool, out); + GF_OPTION_RECONF ("iam-gfproxy-daemon", conf->iamgfproxyd, options, bool, out); - GF_OPTION_RECONF ("fop-sample-hard-errors", - conf->sample_hard_errors, options, bool, out); + GF_OPTION_RECONF ("fop-throttling-enable", conf->throttling_enabled, options, bool, out); - GF_OPTION_RECONF ("dump-percentile-latencies", - conf->dump_percentile_latencies, options, bool, out); + GF_OPTION_RECONF ("measure-namespace-rates", conf->measure_namespace_rates, options, bool, out); + + init_namespace_options (conf); ret = 0; out: - gf_log (this ? 
this->name : "io-stats", - GF_LOG_DEBUG, "reconfigure returning %d", ret); + gf_log (this->name, GF_LOG_DEBUG, "reconfigure returning %d", ret); return ret; } @@ -4529,10 +4870,149 @@ ios_conf_destroy (struct ios_conf *conf) if (!conf) return; + dict_destroy (conf->hash_to_ns); + dict_destroy (conf->throttling_rates); + io_stats_destroy_fop_hits_dict (&conf->fop_hits_dict); ios_destroy_top_stats (conf); _ios_destroy_dump_thread (conf); LOCK_DESTROY (&conf->lock); - GF_FREE(conf); + GF_FREE (conf); +} + +int +ios_conf_parse_namespace_conf_line (char *file_line, dict_t *hash_to_ns, + dict_t *throttle_rates) +{ + int ret = 0; + int scanned = 0; + uint32_t ns_hash = 0; + int64_t fop_rate = 0; + char *ns_name = NULL; + char ns_key[DICT_UINT32_KEY_SIZE]; + + ns_name = GF_CALLOC (strlen (file_line), sizeof (char), 0); + + if (!ns_name) { + gf_log (GF_IO_STATS, GF_LOG_CRITICAL, + "Memory allocation error!"); + ret = ENOMEM; + goto out; + } + + scanned = sscanf (file_line, "%s %ld", ns_name, &fop_rate); + + if (scanned < 1 || strlen (ns_name) < 1) { + /* TODO(mgoulet): Log this line as skipped. */ + goto out; + } + + ns_hash = SuperFastHash (ns_name, strlen (ns_name)); + dict_uint32_to_key (ns_hash, ns_key); + ret = dict_set_dynstr_with_alloc (hash_to_ns, ns_key, ns_name); + + if (ret != 0) { + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Failed to set hash/namespace pair %u -> \'%s\'", + ns_hash, ns_name); + goto out; + } else { + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Loaded namespace %u -> \'%s\'", + ns_hash, ns_name); + } + + if (scanned == 1) { + /* We only detected a namespace, but no throttle rate, + * so let's quit now. 
*/ + goto out; + } + + ret = dict_set_int64 (throttle_rates, ns_name, fop_rate); + + if (ret != 0) { + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Failed to set fop rate \'%s\' -> %ld", + ns_name, fop_rate); + goto out; + } else { + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Loaded fop rate \'%s\' -> %ld", + ns_name, fop_rate); + } + +out: + if (ns_name) { + GF_FREE (ns_name); + } + + return ret; +} + +int +ios_conf_load_namespace_conf (struct ios_conf *conf) +{ + int ret = 0; + FILE *fp = NULL; + char *line = NULL; + size_t len = 0; + time_t curr_mtime = {0}; + + get_file_mtime (_IOS_NAMESPACE_CONF, &curr_mtime); + if (conf->namespace_conf_mtime == curr_mtime) { + gf_log (GF_IO_STATS, GF_LOG_DEBUG, + "%s has not changed, not reloading.", + _IOS_NAMESPACE_CONF); + return 0; + } + + if (!conf->fop_hits_dict) { + conf->fop_hits_dict = dict_new (); + if (!conf->fop_hits_dict) { + ret = ENOMEM; + goto done; + } + } + + if (!conf->hash_to_ns) { + conf->hash_to_ns = dict_new (); + if (!conf->hash_to_ns) { + ret = ENOMEM; + goto done; + } + } + + if (!conf->throttling_rates) { + conf->throttling_rates = dict_new (); + if (!conf->throttling_rates) { + ret = ENOMEM; + goto done; + } + } + + fp = fopen (_IOS_NAMESPACE_CONF, "r"); + if (!fp) { + ret = ENOENT; + goto done; + } + + gf_log (GF_IO_STATS, GF_LOG_INFO, + "Reloading %s from disk.", + _IOS_NAMESPACE_CONF); + + while (getline (&line, &len, fp) != -1) { + ios_conf_parse_namespace_conf_line (line, conf->hash_to_ns, + conf->throttling_rates); + } + + free (line); + conf->namespace_conf_mtime = curr_mtime; + +done: + if (fp) { + fclose (fp); + } + + return ret; } int @@ -4550,6 +5030,7 @@ init (xlator_t *this) int ret = -1; uint32_t log_buf_size = 0; uint32_t log_flush_timeout = 0; + glusterfs_ctx_t *ctx = NULL; if (!this) return -1; @@ -4568,11 +5049,19 @@ init (xlator_t *this) "dangling volume. 
check volfile "); } + ctx = this->ctx; + GF_ASSERT (ctx); + + if (ctx->cmd_args.stats_instance_name) { + this->instance_name = ctx->cmd_args.stats_instance_name; + } + conf = GF_CALLOC (1, sizeof(*conf), gf_io_stats_mt_ios_conf); if (!conf) goto out; + /* * Init it just after calloc, so that we are sure the lock is inited * in case of error paths. @@ -4595,14 +5084,19 @@ init (xlator_t *this) GF_OPTION_INIT ("iam-gfproxy-daemon", conf->iamgfproxyd, bool, out); - GF_OPTION_INIT ("fop-sample-hard-errors", conf->sample_hard_errors, - bool, out); + GF_OPTION_INIT ("fop-throttling-enable", conf->throttling_enabled, bool, out); - GF_OPTION_INIT ("fop-sample-all-errors", conf->sample_all_errors, - bool, out); + GF_OPTION_INIT ("measure-namespace-rates", conf->measure_namespace_rates, bool, out); + + GF_OPTION_INIT ("fop-sample-enable-audit", conf->audit_creates_and_unlinks, bool, out); + + GF_OPTION_INIT ("fop-sample-all-errors", conf->sample_all_errors, bool, out); - GF_OPTION_INIT ("fop-sample-enable-audit", - conf->audit_creates_and_unlinks, bool, out); + GF_OPTION_INIT ("fop-sample-hard-errors", conf->sample_hard_errors, bool, out); + + GF_OPTION_INIT ("dump-percentile-latencies", conf->dump_percentile_latencies, bool, out); + + init_namespace_options (conf); GF_OPTION_INIT ("dump-fd-stats", conf->dump_fd_stats, bool, out); @@ -4614,6 +5108,9 @@ init (xlator_t *this) GF_OPTION_INIT ("ios-dump-interval", conf->ios_dump_interval, int32, out); + GF_OPTION_INIT ("ns-rate-window", conf->ns_rate_window, + int32, out); + GF_OPTION_INIT ("ios-sample-interval", conf->ios_sample_interval, int32, out); @@ -4639,8 +5136,7 @@ init (xlator_t *this) GF_OPTION_INIT ("log-level", log_str, str, out); if (log_str) { log_level = glusterd_check_log_level (log_str); - if (DEFAULT_LOG_LEVEL != log_level) - gf_log_set_loglevel (log_level); + gf_log_set_loglevel (log_level); } GF_OPTION_INIT ("logger", logger_str, str, out); @@ -4661,14 +5157,10 @@ init (xlator_t *this) GF_OPTION_INIT 
("log-flush-timeout", log_flush_timeout, time, out); gf_log_set_log_flush_timeout (log_flush_timeout); - GF_OPTION_INIT ("dump-percentile-latencies", - conf->dump_percentile_latencies, bool, out); - + this->private = conf; - if (conf->ios_dump_interval > 0) { - pthread_create (&conf->dump_thread, NULL, - (void *) &_ios_dump_thread, this); - } + pthread_create (&conf->dump_thread, NULL, + (void *) &_ios_dump_thread, this); ret = 0; out: if (!this->private) { @@ -4688,9 +5180,8 @@ fini (xlator_t *this) return; conf = this->private; - - ios_conf_destroy (conf); this->private = NULL; + ios_conf_destroy (conf); gf_log (this->name, GF_LOG_INFO, "io-stats translator unloaded"); return; @@ -4708,6 +5199,7 @@ notify (xlator_t *this, int32_t event, void *data, ...) double throughput = 0; double time = 0; gf_boolean_t is_peek = _gf_false; + va_list ap; dict = data; @@ -4800,8 +5292,8 @@ notify (xlator_t *this, int32_t event, void *data, ...) (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_DICT, output); - ret = io_stats_dump (this, NULL, &args, op, - is_peek); + ret = io_stats_dump (this, NULL, &args, + op, is_peek); } } break; @@ -4876,21 +5368,13 @@ struct volume_options options[] = { .description = "If on stats related to file-operations would be " "tracked inside GlusterFS data-structures." }, - { .key = { "ios-dump-interval" }, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = 3600, - .default_value = "0", - .description = "Interval (in seconds) at which to auto-dump " - "statistics. Zero disables automatic dumping." - }, { .key = { "ios-sample-interval" }, .type = GF_OPTION_TYPE_INT, .min = 0, .max = 65535, - .default_value = "0", + .default_value = "100", .description = "Interval in which we want to collect FOP latency " - "samples. 2 means collect a sample every 2nd FOP." + "samples. 2 means collect a sample every 2nd FOP." 
}, { .key = { "ios-sample-buf-size" }, .type = GF_OPTION_TYPE_INT, @@ -4899,13 +5383,27 @@ struct volume_options options[] = { .default_value = "65535", .description = "The maximum size of our FOP sampling ring buffer." }, + { .key = { "ios-dump-interval" }, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 3600, + .default_value = "5", + .description = "Interval in which we want to auto-dump statistics." + }, + { .key = { "ns-rate-window" }, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = FOP_HITS_SIZE, + .default_value = "5", + .description = "Number of ios-dump-interval length windows to calculate " + "our namespace FOP sma with." + }, { .key = { "ios-dnscache-ttl-sec" }, .type = GF_OPTION_TYPE_INT, .min = 1, .max = 3600 * 72, - .default_value = "86400", - .description = "The interval after wish a cached DNS entry will be " - "re-validated. Default: 24 hrs" + .default_value = "43200", + .description = "Interval in which we want to auto-dump statistics." }, { .key = { "latency-measurement" }, .type = GF_OPTION_TYPE_BOOL, @@ -5026,58 +5524,71 @@ struct volume_options options[] = { "log messages that can be buffered for a time equal to" " the value of the option brick-log-flush-timeout." }, - { .key = {"iam-self-heal-daemon"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option differentiates if the io-stats " - "translator is running as part of an self-heal " - "daemon or not." - }, - { .key = {"iam-nfs-daemon"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option differentiates if the io-stats " - "translator is running as part of an NFS daemon " - "or not." + { .key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the io-stats " + "translator is running as part of self-heal-daemon " + "or not." 
}, - { .key = {"iam-brick-daemon"}, + { .key = {"iam-nfs-daemon"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", .description = "This option differentiates if the io-stats " - "translator is running as part of brick daemon " + "translator is running as part of an NFS daemon " "or not." }, + { + .key = {"iam-brick-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the io-stats " + "translator is running as part of brick daemon " + "or not." + }, { .key = {"iam-gfproxy-daemon"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", .description = "This option differentiates if the io-stats " - "translator is running as part of an GFProxy daemon " + "translator is running as part of an NFS daemon " "or not." }, + { .key = {"fop-throttling-enable"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option enables whether certain namespaces " + "(exports) will be throttled when they exceed their " + "allowed FOPs/sec rate." + }, + { .key = {"measure-namespace-rates"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option enables tracking of FOP rates for individual namespaces. " + "It is automatically set to true if fop-throttling-enable is enabled." + }, { .key = {"fop-sample-enable-audit"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", - .description = "This option samples the following FOPs 1:1: " - "CREATE, UNLINK, MKDIR, RMDIR, TRUNCATE. " + .description = "This option samples the following FOPs 1:1: CREATE, UNLINK, MKDIR, RMDIR, TRUNCATE." + " fop-sample-interval must be set to something non-zero." }, { .key = {"fop-sample-all-errors"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", - .description = "This option samples all fops that failed." + .description = "This option samples all fops with a non-zero op_errno " + "value 1:1." 
}, { .key = {"fop-sample-hard-errors"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", - .description = "This option samples all fops with \"hard errors\"" + .description = "This option samples all fops with \"hard errors\" 1:1," "including EROFS, ENOSPC, etc." }, { .key = { "dump-percentile-latencies" }, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", - .description = "If on the p50, p90, p95, and p99 latency of each " - "operation and all types is dumped at each sample " - "interval." + .description = "If on the p50, p90, p95, and p99 latency of each operation " + "and all types is dumped at each sample interval. " }, { .key = {NULL} }, - }; diff --git a/xlators/debug/io-stats/src/io-stats.h b/xlators/debug/io-stats/src/io-stats.h new file mode 100644 index 00000000000..678f67929f4 --- /dev/null +++ b/xlators/debug/io-stats/src/io-stats.h @@ -0,0 +1,381 @@ +#ifndef _IO_STATS_H_ +#define _IO_STATS_H_ + +#define OFFSET_WRITE_DETECTOR 0 +#define UNKNOWN_WRITE_OFFSET ((off_t)-1) + +#define FOP_HITS_SIZE 32 +struct _ios_fop_hits_s { + uint64_t idx; + uint64_t hits[FOP_HITS_SIZE]; + double elapsed[FOP_HITS_SIZE]; + double avg_fops; + double avg_latency; +}; +typedef struct _ios_fop_hits_s ios_fop_hits_t; + +ios_fop_hits_t *ios_fop_hits_init (); + +#define GF_IO_STATS "io-stats" + +#define MAX_LIST_MEMBERS 100 +#define DEFAULT_PWD_BUF_SZ 16384 +#define DEFAULT_GRP_BUF_SZ 16384 +#define IOS_MAX_ERRORS 132 + +typedef enum { + IOS_STATS_TYPE_NONE, + IOS_STATS_TYPE_OPEN, + IOS_STATS_TYPE_READ, + IOS_STATS_TYPE_WRITE, + IOS_STATS_TYPE_OPENDIR, + IOS_STATS_TYPE_READDIRP, + IOS_STATS_TYPE_READ_THROUGHPUT, + IOS_STATS_TYPE_WRITE_THROUGHPUT, + IOS_STATS_TYPE_MAX +}ios_stats_type_t; + +typedef enum { + IOS_STATS_THRU_READ, + IOS_STATS_THRU_WRITE, + IOS_STATS_THRU_MAX, +}ios_stats_thru_t; + +struct ios_stat_lat { + struct timeval time; + double throughput; +}; + +struct ios_stat { + gf_lock_t lock; + uuid_t gfid; + char *filename; + uint64_t counters 
[IOS_STATS_TYPE_MAX]; + struct ios_stat_lat thru_counters [IOS_STATS_THRU_MAX]; + int refcnt; +#if OFFSET_WRITE_DETECTOR + off_t expected_write_offset; +#endif +}; + +struct ios_stat_list { + struct list_head list; + struct ios_stat *iosstat; + double value; +}; + +struct ios_stat_head { + gf_lock_t lock; + double min_cnt; + uint64_t members; + struct ios_stat_list *iosstats; +}; + +typedef struct _ios_sample_t { + uid_t uid; + gid_t gid; + char identifier[UNIX_PATH_MAX]; + glusterfs_fop_t fop_type; + gf_fop_pri_t fop_pri; + struct timeval timestamp; + double elapsed; + ns_info_t ns_info; + char path[4096]; + gf_boolean_t have_path; + int32_t op_ret; + int32_t op_errno; +} ios_sample_t; + + +typedef struct _ios_sample_buf_t { + uint64_t pos; /* Position in write buffer */ + uint64_t size; /* Size of ring buffer */ + uint64_t collected; /* Number of samples we've collected */ + uint64_t observed; /* Number of FOPs we've observed */ + ios_sample_t *ios_samples; /* Our list of samples */ +} ios_sample_buf_t; + + +struct ios_lat { + double min; + double max; + double avg; + uint64_t total; +}; + +struct ios_global_stats { + uint64_t data_written; + uint64_t data_read; + uint64_t block_count_write[32]; + uint64_t block_count_read[32]; + uint64_t fop_hits[GF_FOP_MAXVALUE]; + struct timeval started_at; + struct ios_lat latency[GF_FOP_MAXVALUE]; + uint64_t errno_count[IOS_MAX_ERRORS]; + uint64_t nr_opens; + uint64_t max_nr_opens; + struct timeval max_openfd_time; +#if OFFSET_WRITE_DETECTOR + uint64_t total_write_ops; + uint64_t offset_write_ops; +#endif +}; + +struct ios_conf { + gf_lock_t lock; + struct ios_global_stats cumulative; + uint64_t increment; + struct ios_global_stats incremental; + gf_boolean_t dump_fd_stats; + gf_boolean_t count_fop_hits; + gf_boolean_t measure_latency; + struct ios_stat_head list[IOS_STATS_TYPE_MAX]; + struct ios_stat_head thru_list[IOS_STATS_THRU_MAX]; + int32_t ios_dump_interval; + int32_t ns_rate_window; + pthread_t dump_thread; + 
gf_boolean_t dump_thread_should_die; + gf_lock_t ios_sampling_lock; + int32_t ios_sample_interval; + int32_t ios_sample_buf_size; + ios_sample_buf_t *ios_sample_buf; + struct dnscache *dnscache; + int32_t ios_dnscache_ttl_sec; + dict_t *hash_to_ns; /* Hash -> NS name */ + dict_t *fop_hits_dict; /* NS -> Real rate */ + dict_t *throttling_rates; /* NS -> Max rate */ + gf_boolean_t iamgfproxyd; + gf_boolean_t iamnfsd; + gf_boolean_t iamshd; + gf_boolean_t iambrickd; + gf_boolean_t throttling_enabled; + time_t namespace_conf_mtime; + gf_boolean_t measure_namespace_rates; + gf_boolean_t audit_creates_and_unlinks; + gf_boolean_t sample_hard_errors; + gf_boolean_t dump_percentile_latencies; + gf_boolean_t sample_all_errors; + uint32_t outstanding_req; +}; + + +struct ios_fd { + char *filename; + uint64_t data_written; + uint64_t data_read; + uint64_t block_count_write[32]; + uint64_t block_count_read[32]; + struct timeval opened_at; +}; + +typedef enum { + IOS_DUMP_TYPE_NONE = 0, + IOS_DUMP_TYPE_FILE = 1, + IOS_DUMP_TYPE_DICT = 2, + IOS_DUMP_TYPE_JSON_FILE = 3, + IOS_DUMP_TYPE_SAMPLES = 4, + IOS_DUMP_TYPE_MAX = 5 +} ios_dump_type_t; + +struct ios_dump_args { + xlator_t *this; + ios_dump_type_t type; + union { + FILE *logfp; + dict_t *dict; + } u; +}; + +typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*, + int, int, uint64_t); + +// Will be present in frame->local +struct ios_local { + inode_t *inode; + loc_t loc; + fd_t *fd; +}; + +#define ios_local_new() GF_CALLOC (1, sizeof (struct ios_local), gf_common_mt_char); + +void +ios_local_free (struct ios_local *local) +{ + if (!local) + return; + + inode_unref (local->inode); + + if (local->fd) + fd_unref (local->fd); + + loc_wipe (&local->loc); + memset (local, 0, sizeof (*local)); + GF_FREE (local); +} + +struct volume_options options[]; + +inline static int +is_fop_latency_started (call_frame_t *frame) +{ + GF_ASSERT (frame); + struct timeval epoch = {0, }; + return memcmp (&frame->begin, &epoch, sizeof 
(epoch)); +} + +const char *_IOS_DUMP_DIR = "/var/lib/glusterd/stats"; +const char *_IOS_SAMP_DIR = "/var/log/glusterfs/samples"; +const char *_IOS_NAMESPACE_CONF = TOSTRING (GLUSTERD_WORKDIR) "/namespaces.conf"; + +extern const char *__progname; + +/* This is a list of errors which are in some way critical. + * It is useful to sample these errors even if other errors + * should be ignored. */ +const int32_t ios_hard_error_list[] = { + EIO, + EROFS, + ENOSPC, + ENOTCONN, + ESTALE, +}; + +#define IOS_HARD_ERROR_LIST_SIZE (sizeof(ios_hard_error_list) / sizeof(int32_t)) + +const char *errno_to_name[IOS_MAX_ERRORS] = { + "success", /* 0 */ + "eperm", + "enoent", + "esrch", + "eintr", + "eio", + "enxio", + "e2big", + "enoexec", + "ebadf", + "echild", + "eagain", + "enomem", + "eacces", + "efault", + "enotblk", + "ebusy", + "eexist", + "exdev", + "enodev", + "enotdir", + "eisdir", + "einval", + "enfile", + "emfile", + "enotty", + "etxtbsy", + "efbig", + "enospc", + "espipe", + "erofs", + "emlink", + "epipe", + "edom", + "erange", + "edeadlk", + "enametoolong", + "enolck", + "enosys", + "enotempty", + "eloop", + "ewouldblock", + "enomsg", + "eidrm", + "echrng", + "el2nsync", + "el3hlt", + "el3rst", + "elnrng", + "eunatch", + "enocsi", + "el2hlt", + "ebade", + "ebadr", + "exfull", + "enoano", + "ebadrqc", + "ebadslt", + "edeadlock", + "ebfont", + "enostr", + "enodata", + "etime", + "enosr", + "enonet", + "enopkg", + "eremote", + "enolink", + "eadv", + "esrmnt", + "ecomm", + "eproto", + "emultihop", + "edotdot", + "ebadmsg", + "eoverflow", + "enotuniq", + "ebadfd", + "eremchg", + "elibacc", + "elibbad", + "elibscn", + "elibmax", + "elibexec", + "eilseq", + "erestart", + "estrpipe", + "eusers", + "enotsock", + "edestaddrreq", + "emsgsize", + "eprototype", + "enoprotoopt", + "eprotonosupport", + "esocktnosupport", + "eopnotsupp", + "epfnosupport", + "eafnosupport", + "eaddrinuse", + "eaddrnotavail", + "enetdown", + "enetunreach", + "enetreset", + "econnaborted", + 
"econnreset", + "enobufs", + "eisconn", + "enotconn", + "eshutdown", + "etoomanyrefs", + "etimedout", + "econnrefused", + "ehostdown", + "ehostunreach", + "ealready", + "einprogress", + "estale", + "euclean", + "enotnam", + "enavail", + "eisnam", + "eremoteio", + "edquot", + "enomedium", + "emediumtype", + "ecanceled", + "enokey", + "ekeyexpired", + "ekeyrevoked", + "ekeyrejected", + "eownerdead", + "enotrecoverable" +}; + +#endif /* _IO_STATS_H_ */ diff --git a/xlators/features/namespace/src/namespace.c b/xlators/features/namespace/src/namespace.c index 639dfa33fe2..4585087025e 100644 --- a/xlators/features/namespace/src/namespace.c +++ b/xlators/features/namespace/src/namespace.c @@ -143,7 +143,7 @@ ns_inode_ctx_put (inode_t *inode, xlator_t *this, ns_info_t *info) int ret = -1; if (!inode || !this) { - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "Need a valid inode and xlator to cache ns_info."); ret = -1; goto out; @@ -253,10 +253,10 @@ get_path_resume_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "G>P %s %10u namespace found %s", uuid_utoa (local->loc.inode->gfid), info->hash, path); } else if (ret == PATH_PARSE_RESULT_NO_PATH) { - gf_log (this->name, GF_LOG_WARNING, "G>P %s has no path", + gf_log (this->name, GF_LOG_DEBUG, "G>P %s has no path", uuid_utoa (local->loc.inode->gfid)); } else if (ret == PATH_PARSE_RESULT_IS_GFID) { - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "G>P %s winding failed, still have gfid", uuid_utoa (local->loc.inode->gfid)); } @@ -303,16 +303,26 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc * from the inode context, then from the loc's path itself. 
*/ if (!loc || !loc->path || !loc->inode) { ret = PATH_PARSE_RESULT_NO_PATH; - } else if (!ns_inode_ctx_get (loc->inode, this, info)) { + goto out; + } + + if (!ns_inode_ctx_get (loc->inode, this, info)) { ret = PATH_PARSE_RESULT_FOUND; - } else { - ret = parse_path (info, loc->path); - gf_log (this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", - fn, loc->path); + goto out; + } - if (ret == PATH_PARSE_RESULT_FOUND) { - ns_inode_ctx_put (loc->inode, this, info); - } + if (gf_uuid_is_null (loc->inode->gfid) && gf_uuid_is_null (loc->gfid)) { + ret = PATH_PARSE_RESULT_NO_PATH; + goto out; + } + + ret = parse_path (info, loc->path); + gf_log (this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", + fn, loc->path); + + if (ret == PATH_PARSE_RESULT_FOUND) { + ns_inode_ctx_put (loc->inode, this, info); + goto out; } /* Keep trying by calling inode_path next, making sure to copy @@ -337,6 +347,7 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc } } +out: /* Report our status, and if we have a GFID, we'll eventually try a * GET_ANCESTRY_PATH_KEY wind when we return from this function. */ if (ret == PATH_PARSE_RESULT_FOUND) { @@ -344,7 +355,7 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc "%s: LOC %s %10u namespace found for %s", fn, uuid_utoa (loc->inode->gfid), info->hash, loc->path); } else if (ret == PATH_PARSE_RESULT_NO_PATH) { - gf_log (this->name, GF_LOG_WARNING, "%s: LOC has no path", fn); + gf_log (this->name, GF_LOG_DEBUG, "%s: LOC has no path", fn); } else if (ret == PATH_PARSE_RESULT_IS_GFID) { /* Make sure to copy the inode's gfid for the eventual wind. */ if (gf_uuid_is_null (loc->inode->gfid)) { @@ -381,9 +392,20 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd) * from the inode context, then inode_path. 
*/ if (!fd || !fd->inode) { ret = PATH_PARSE_RESULT_NO_PATH; - } else if (!ns_inode_ctx_get (fd->inode, this, info)) { + goto out; + } + + if (!ns_inode_ctx_get (fd->inode, this, info)) { ret = PATH_PARSE_RESULT_FOUND; - } else if (inode_path (fd->inode, NULL, &path) >= 0 && path) { + goto out; + } + + if (gf_uuid_is_null (fd->inode->gfid)) { + ret = PATH_PARSE_RESULT_NO_PATH; + goto out; + } + + if (inode_path (fd->inode, NULL, &path) >= 0 && path) { ret = parse_path (info, path); gf_log (this->name, GF_LOG_DEBUG, "%s: FD retrieved path %s", fn, path); @@ -397,13 +419,14 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd) GF_FREE (path); } +out: /* Report our status, and if we have a GFID, we'll eventually try a * GET_ANCESTRY_PATH_KEY wind when we return from this function. */ if (ret == PATH_PARSE_RESULT_FOUND) { gf_log (this->name, GF_LOG_DEBUG, "%s: FD %s %10u namespace found", fn, uuid_utoa (fd->inode->gfid), info->hash); } else if (ret == PATH_PARSE_RESULT_NO_PATH) { - gf_log (this->name, GF_LOG_WARNING, "%s: FD has no path", fn); + gf_log (this->name, GF_LOG_DEBUG, "%s: FD has no path", fn); } else if (ret == PATH_PARSE_RESULT_IS_GFID) { gf_log (this->name, GF_LOG_DEBUG, "%s: FD %s winding, looking for path", @@ -442,6 +465,12 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd) goto wind; \ } \ \ + if (gf_uuid_is_null (inode->gfid)) { \ + gf_log (this->name, GF_LOG_DEBUG, \ + "Cannot wind on a NULL GFID."); \ + goto wind; \ + } \ + \ new_frame->root->uid = new_frame->root->gid = 0; \ /* Put a phony "not found" NS info into this call. 
*/ \ new_frame->root->ns_info = *info; \ @@ -750,8 +779,10 @@ wind: int32_t ns_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ns_private_t *priv = (ns_private_t *)this->private; + path_parse_result_t ret = set_ns_from_loc (__FUNCTION__, frame, this, loc); - if (ret == PATH_PARSE_RESULT_IS_GFID) { + if (ret == PATH_PARSE_RESULT_IS_GFID && priv->wind_lookups) { GET_ANCESTRY_PATH_WIND (lookup, loc->inode, loc, xdata); return 0; } @@ -1229,6 +1260,7 @@ init (xlator_t *this) } GF_OPTION_INIT ("tag-namespaces", priv->tag_namespaces, bool, out); + GF_OPTION_INIT ("wind-lookups", priv->wind_lookups, bool, out); gf_log (this->name, GF_LOG_INFO, "Namespace xlator loaded"); this->private = priv; @@ -1262,6 +1294,8 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("tag-namespaces", priv->tag_namespaces, options, bool, out); + GF_OPTION_RECONF ("wind-lookups", priv->wind_lookups, options, + bool, out); ret = 0; out: @@ -1331,5 +1365,16 @@ struct volume_options options[] = { "that tags every fop with a namespace hash for later " "throttling, stats collection, logging, etc.", }, + { + .key = { "wind-lookups" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "If enabled, a GET_ANCESTRY_PATH_KEY getxattr wind " + "will be scheduled for a LOOKUP. 
Since we have many " + "LOOKUP requests which are sent for files that don't " + "exist, it's often benificial to disable this option, " + "so we don't have a high percentage of bad GETXATTRs " + "in io-stats.", + }, {.key = { NULL } }, }; diff --git a/xlators/features/namespace/src/namespace.h b/xlators/features/namespace/src/namespace.h index 3b9f782cb1a..affaba04681 100644 --- a/xlators/features/namespace/src/namespace.h +++ b/xlators/features/namespace/src/namespace.h @@ -13,6 +13,7 @@ typedef struct { gf_boolean_t tag_namespaces; + gf_boolean_t wind_lookups; } ns_private_t; typedef struct { diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6fcf6892d27..a07fd9c3232 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -2447,8 +2447,8 @@ out: * the topology of the brick graph */ static volgen_brick_xlator_t server_graph_table[] = { {brick_graph_add_server, NULL}, - {brick_graph_add_decompounder, "decompounder"}, {brick_graph_add_io_stats, "NULL"}, + {brick_graph_add_decompounder, "decompounder"}, {brick_graph_add_namespace, "namespace"}, {brick_graph_add_cdc, NULL}, {brick_graph_add_quota, "quota"}, diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am index 1d09eace2ed..bdbd0290fd6 100644 --- a/xlators/performance/io-threads/src/Makefile.am +++ b/xlators/performance/io-threads/src/Makefile.am @@ -1,15 +1,16 @@ xlator_LTLIBRARIES = io-threads.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -io_threads_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) +io_threads_la_LDFLAGS = -module -avoid-version io_threads_la_SOURCES = io-threads.c io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = io-threads.h iot-mem-types.h io-threads-messages.h +noinst_HEADERS = io-threads.h iot-mem-types.h -AM_CPPFLAGS = $(GF_CPPFLAGS) 
-I$(top_srcdir)/libglusterfs/src +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -DGLUSTERD_WORKDIR=$(GLUSTERD_WORKDIR) AM_CFLAGS = -Wall $(GF_CFLAGS) -CLEANFILES = +CLEANFILES = diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 04d94691ba3..c0ebdd33caf 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -8,6 +8,11 @@ cases as published by the Free Software Foundation. */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + #include "call-stub.h" #include "defaults.h" #include "glusterfs.h" @@ -20,12 +25,14 @@ #include <sys/time.h> #include <time.h> #include "locking.h" -#include "io-threads-messages.h" #include "timespec.h" +#include "hashfn.h" void *iot_worker (void *arg); int iot_workers_scale (iot_conf_t *conf); int __iot_workers_scale (iot_conf_t *conf); +int32_t iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight); +int32_t __iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight); struct volume_options options[]; #define IOT_FOP(name, frame, this, args ...) \ @@ -50,84 +57,622 @@ struct volume_options options[]; } \ } while (0) +gf_boolean_t +__iot_ns_queue_empty (iot_ns_queue_t *queue) +{ + return (queue->size == 0); +} + +/* Fetch a namespace queue for the given ns_info struct. If the hash was not + * found or the clock is not enabled, then return NULL which will be substituted + * by the unknown queue. 
*/ +iot_ns_queue_t * +__iot_get_ns_queue (iot_conf_t *conf, ns_info_t *info) +{ + char ns_key[DICT_UINT32_KEY_SIZE]; + int32_t ret = -1; + iot_ns_queue_t *queue = NULL; + + if (!conf->ns_weighted_queueing || !info->found) { + gf_log (GF_IO_THREADS, GF_LOG_TRACE, + "Namespace not found for %u", info->hash); + return NULL; + } + + dict_uint32_to_key (info->hash, ns_key); + ret = dict_get_ptr (conf->ns_queues, ns_key, (void **)&queue); + + if (ret) { + gf_log (GF_IO_THREADS, GF_LOG_TRACE, + "Namespace not found for %u", info->hash); + } + + /* If successful, return `queue`. */ + return (!ret && queue) ? queue : NULL; +} + +/* When we parse a new namespace conf file, we create a whole new set of queue + * structs; however, old requests may be sitting on old queues. This function + * drains the old requests lists into the new queue, or alternatively appends + * it onto the unknown queue if there is no corresponding new queue. + * (i.e. if it was removed from the conf file) */ +int +__iot_drain_ns_queue_foreach (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + iot_conf_t *conf = data; + dict_t *new_dict = conf->ns_queues; + iot_ns_queue_t *old_queue = data_to_ptr (value); + iot_ns_queue_t *new_queue = NULL; + int i; + + ret = dict_get_ptr (new_dict, key, (void **)&new_queue); + + /* Don't drain the unknown queue. */ + if (old_queue == conf->ns_unknown_queue) { + return 0; + } + + for (i = 0; i < IOT_PRI_MAX; i++) { + /* If we didn't find a "new queue" corresponding to the old, + * then drain into the unknown queue of that priority level. */ + if (!ret && new_queue) { + list_append_init (&old_queue[i].reqs, + &new_queue[i].reqs); + new_queue[i].size += old_queue[i].size; + } else { + list_append_init (&old_queue[i].reqs, + &conf->ns_unknown_queue[i].reqs); + conf->ns_unknown_queue[i].size += old_queue[i].size; + } + } + + return 0; +} + +/* Drain the namespace queues in old_dict, if there are any. Then free the dict + * and clear the clock structs. 
*/ +void +__iot_drain_and_clear_clock (iot_conf_t *conf, dict_t *old_dict) +{ + int i; + + if (old_dict) { + dict_foreach (old_dict, __iot_drain_ns_queue_foreach, conf); + dict_destroy (old_dict); + } + + for (i = 0; i < IOT_PRI_MAX; i++) { + GF_FREE (conf->ns_clocks[i].slots); + conf->ns_clocks[i].slots = NULL; + conf->ns_clocks[i].idx = 0; + conf->ns_clocks[i].size = 0; + } +} + +/* Parse a single namespace conf line. This constructs a new queue and puts it + * into the namespace dictionary as well, skipping duplicated namespaces with + * a warning. */ +int32_t +__iot_ns_parse_conf_line (iot_conf_t *conf, char *file_line, uint32_t *total_weight) +{ + char ns_key[DICT_UINT32_KEY_SIZE]; + char *ns_name = NULL; + iot_ns_queue_t *queue = NULL; + int32_t queue_weight = 1; + uint32_t ns_hash = 0; + int i, ret = -1, scanned = -1; + + ns_name = GF_CALLOC (strlen (file_line), sizeof (char), 0); + if (!ns_name) { + gf_log (GF_IO_THREADS, GF_LOG_WARNING, + "Memory allocation error!"); + ret = ENOMEM; + goto out; + } + + /* Scan the line, skipping the second column which corresponds to a + * throttling rate, which we don't particularly care about. */ + scanned = sscanf (file_line, "%s %*d %d", ns_name, &queue_weight); + if (scanned < 1 || strlen (ns_name) < 1) { + gf_log (GF_IO_THREADS, GF_LOG_WARNING, + "Empty or malformatted line \"%s\" while parsing", file_line); + goto out; + } + + /* Hash the namespace name, convert it to a key, then search the dict + * for an entry matching this key. */ + ns_hash = SuperFastHash (ns_name, strlen (ns_name)); + + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "Parsed namespace \'%s\' (%u)", ns_name, ns_hash); + + dict_uint32_to_key (ns_hash, ns_key); + ret = dict_get_ptr (conf->ns_queues, ns_key, (void **)&queue); + if (!ret && queue) { + gf_log (GF_IO_THREADS, GF_LOG_WARNING, + "Duplicate-hashed queue found for namespace %s", ns_name); + /* Since ret == 0, we won't free the queue inadvertently. 
*/ + goto out; + } + + queue = GF_CALLOC (IOT_PRI_MAX, sizeof (iot_ns_queue_t), 0); + if (!queue) { + gf_log (GF_IO_THREADS, GF_LOG_WARNING, + "Memory allocation error!"); + ret = -(ENOMEM); + goto out; + } + + /* Init queues. */ + for (i = 0; i < IOT_PRI_MAX; i++) { + INIT_LIST_HEAD (&queue[i].reqs); + queue[i].hash = ns_hash; + queue[i].weight = conf->ns_default_weight; + queue[i].size = 0; + } + + ret = dict_set_ptr (conf->ns_queues, ns_key, queue); + if (ret) { + goto out; + } + + ret = dict_set_dynstr_with_alloc (conf->hash_to_ns, ns_key, ns_name); + if (ret) { + goto out; + } + + if (scanned < 2 || queue_weight < 1) { + gf_log (GF_IO_THREADS, GF_LOG_WARNING, + "No weight (or too low) found in config line for namespace %s, " + "defaulting to weight of %u.", ns_name, conf->ns_default_weight); + } else { + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "Parsed weight \'%s\' = %u", ns_name, queue_weight); + for (i = 0; i < IOT_PRI_MAX; i++) { + queue[i].weight = (uint32_t) queue_weight; + } + } + + *total_weight += queue->weight; + +out: + if (ns_name) { + GF_FREE (ns_name); + } + + if (ret && queue) { + GF_FREE (queue); + } + + return ret; +} + +/* This function (re)initializes the clock that is used by the en/de-queue + * operations. */ +void +__iot_reinit_ns_conf (iot_conf_t *conf) +{ + char ns_unknown_key[DICT_UINT32_KEY_SIZE]; + dict_t *old_dict, *new_dict; + FILE *fp = NULL; + char *line = NULL; + size_t len = 0; + uint32_t total_weight; + int i, ret = 0; + + if (!conf) { + return; + } + + if (conf->ns_weighted_queueing) { + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "Loading %s from disk.", + _IOT_NAMESPACE_CONF); + + fp = fopen (_IOT_NAMESPACE_CONF, "r"); + if (!fp) { + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "Cannot open file for reading."); + ret = ENOENT; + goto out; + } + + /* Save the old queues; we will need to drain old requests out + * of it once we make new queues. 
*/ + old_dict = conf->ns_queues; + conf->ns_queues = new_dict = get_new_dict (); + + /* Include the unknown queue weight, which isn't a parsed line. */ + total_weight = conf->ns_default_weight; + + /* Parse the new config file line-by-line, making a new queue + * for each namespace that is parsed. */ + while (getline (&line, &len, fp) != -1) { + __iot_ns_parse_conf_line (conf, line, &total_weight); + } + free (line); + + /* Drain old queues into new queues, or into unknown queue. */ + __iot_drain_and_clear_clock (conf, old_dict); + + /* We will add the unknown queue manually into the dictionaries. */ + dict_uint32_to_key (0, ns_unknown_key); + ret = dict_set_dynstr_with_alloc (conf->hash_to_ns, + ns_unknown_key, "unknown"); + if (ret) { + goto out; + } + + /* Set the ns_unknown_queue as static pointer so it's not freed + * in the queue drain step next time the automation. */ + ret = dict_set_static_ptr (conf->ns_queues, ns_unknown_key, + conf->ns_unknown_queue); + if (ret) { + goto out; + } + + for (i = 0; i < IOT_PRI_MAX; i++) { + ret = __iot_reinit_clock (conf, i, total_weight); + if (ret) { + goto out; + } + } + + /* Finally, keep our conf struct updated so we don't spuriously + * reconfigure the clock. */ + get_file_mtime (_IOT_NAMESPACE_CONF, &conf->ns_conf_mtime); + } + + ret = 0; +out: + if (fp) { + fclose (fp); + } + + if (ret) { + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "There was an error loading the namespaces conf file, " + "disabling clock."); + conf->ns_weighted_queueing = _gf_false; + } + + /* If our clock isn't enabled (or it was disabled because of an error) + * then drain the queues if there are any. */ + if (!conf->ns_weighted_queueing) { + old_dict = conf->ns_queues; + /* This NULL signals to the drain that we're not populating any + * new queues... 
*/ + conf->ns_queues = NULL; + __iot_drain_and_clear_clock (conf, old_dict); + } +} + +/* This is a simple iterative algorithm which tries to allocate the lowest + * amount of slots that maintains the same proportional amount of work given + * to each namespace. + * + * Each namespace has a weight, and the proportion of "weight / total weight" + * is something we'll call the "ideal" work ratio. If we have no latency and + * infinite queues, this ratio is the amount of requests we serve (out of total) + * for the given namespace. + * + * For a given namespace idx i, let the namespace's weight be W_i, and the total + * weight be TW. The "ideal" percentage is thus given by W_i/TW. Now if we + * choose some arbitrary total number of slots TS, the number of slots we + * should give to the namespace is given by S_i = TS*(W_i/TW). Since this is + * probably not an integer, we'll actually floor the number, meaning that the + * sum of S_i for each namespace most likely doesn't equal the TS we chose + * earlier. Let this "real" total sum of slots be RS. + * + * Now we have the concept of an "realized" percentage given by the ratio of + * _allocated slots_ instead of just ideal weights, given by S_i/RS. We consider + * this set of slot allocations to be ideal if these two ratios (slots and + * weights) are "close enough", given by our ns-weight-tolerance option. + * + * We then use a loop to distribute the shares, using some modulo magic to + * get a good, semi-even distribution in the slots array. The main concern here + * is trying to make sure that no single share is bunched up. + * If we have namespaces A and B with 2 and 8 slots respectively, we should + * shoot for a distribution like [A B B B B A B B B] unstead of like + * [A A B B B B B B B B]. This loop tries its best to do that. We don't want + * to shuffle randomly either, since there is still a risk of having a bad + * bunching if our RNG is bad or we're unlucky... 
*/ +int32_t +__iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight) +{ + int32_t ret = -1; + uint32_t try_slots, total_slots, slots_idx, rep; + data_pair_t *curr = NULL; + iot_ns_queue_t *curr_queue = NULL; + iot_ns_queue_t **slots = NULL; + char *ns_name = NULL; + gf_boolean_t fail; + double real_percentage; + + /* Initialize the "ideal" percentage each queue should have. */ + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + + curr_queue->percentage = ((double) curr_queue->weight) + / total_weight; + } + + /* We try to satisfy each percentage within some margin, first trying + * 1 slot, until we get up to the total sum of all weights, which will + * obviously satisfy each percentage but in many cases is far too large + * for a slots matrix. */ + for (try_slots = 1; try_slots <= total_weight; try_slots++) { + fail = _gf_false; + total_slots = 0; + + /* Calculate how many slots each namespace much get. Since we're + * rounding integers, we keep track of the actual total number + * of slots in `total_slots`. */ + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + + curr_queue->slots = (int) (curr_queue->percentage * try_slots); + total_slots += curr_queue->slots; + + /* If we've allocated less than 1 slot for the queue, + * we should find a larger size. */ + if (curr_queue->slots < 1) { + fail = _gf_true; + break; + } + } + + if (fail) { + continue; + } + + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + + real_percentage = ((double) curr_queue->slots) / total_slots; + /* If the realized percentage is more than ns_weight_tolerance + * percent away from the ideal percentage, then let's try + * another number. 
*/ + if (abs (real_percentage - curr_queue->percentage) * 100.0 + > conf->ns_weight_tolerance) { + fail = _gf_true; + break; + } + } + + if (!fail) { + break; + } + } + + /* Report the fits that we have found. */ + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + + real_percentage = ((double) curr_queue->slots) / total_slots; + ret = dict_get_str (conf->hash_to_ns, curr->key, &ns_name); + + if (ret || !ns_name) { + continue; + } + + gf_log (GF_IO_THREADS, GF_LOG_INFO, + "Initializing namespace \'%s\' (weight: %d) with %d slots. " + "Ideal percentage: %0.2f%%, real percentage: %0.2f%%.", + ns_name, curr_queue->weight, curr_queue->slots, + curr_queue->percentage * 100.0, real_percentage * 100.0); + } + + /* At this point, we've either found a good fit, or gotten all the way to + * total_weight. In either case, we can start allocating slots. */ + slots = GF_CALLOC (total_slots, sizeof (iot_ns_queue_t *), 0); + slots_idx = 0; + rep = 0; + + if (!slots) { + ret = -(ENOMEM); + goto out; + } + + /* Allocate slots, with some fun modulo-math to make sure that they're + * well distributed. */ + while (total_slots != slots_idx) { + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + + if (curr_queue->slots == 0) { + continue; + } + + if (rep % (total_slots / curr_queue->slots) == 0) { + slots[slots_idx++] = curr_queue; + curr_queue->slots--; + } + } + + rep++; + } + + /* Set the slots into the queue, and we're ready to go! 
*/ + conf->ns_clocks[i].slots = slots; + conf->ns_clocks[i].size = total_slots; + conf->ns_clocks[i].idx = 0; + + ret = 0; +out: + if (ret && slots) { + GF_FREE (slots); + } + + return ret; +} + + +void * +iot_reinit_ns_conf_thread (void *arg) +{ + xlator_t *this = arg; + iot_conf_t *conf = this->private; + time_t curr_mtime = {0, }; + + while (_gf_true) { + sleep (conf->ns_conf_reinit_secs); + + get_file_mtime (_IOT_NAMESPACE_CONF, &curr_mtime); + if (conf->ns_weighted_queueing && curr_mtime != conf->ns_conf_mtime) { + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + pthread_mutex_lock (&conf->mutex); + { + __iot_reinit_ns_conf (conf); + } + pthread_mutex_unlock (&conf->mutex); + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } else { + gf_log (GF_IO_THREADS, GF_LOG_DEBUG, + "Config file %s not modified, skipping.", + _IOT_NAMESPACE_CONF); + } + } +} + +static void +start_iot_reinit_ns_conf_thread (xlator_t *this) +{ + iot_conf_t *priv = this->private; + int ret; + + if (!priv) { + return; + } + + if (priv->reinit_ns_conf_thread_running) { + gf_log (GF_IO_THREADS, GF_LOG_INFO, "reinit_ns_conf_thread already started."); + return; + } + + gf_log (GF_IO_THREADS, GF_LOG_INFO, "Starting reinit_ns_conf_thread."); + + ret = pthread_create (&priv->reinit_ns_conf_thread, NULL, iot_reinit_ns_conf_thread, this); + if (ret == 0) { + priv->reinit_ns_conf_thread_running = _gf_true; + } else { + gf_log (this->name, GF_LOG_WARNING, + "pthread_create(iot_reinit_ns_conf_thread) failed"); + } +} + +static void +stop_iot_reinit_ns_conf_thread (xlator_t *this) +{ + iot_conf_t *priv = this->private; + + if (!priv) { + return; + } + + if (!priv->reinit_ns_conf_thread_running) { + gf_log (GF_IO_THREADS, GF_LOG_INFO, "reinit_ns_conf_thread already stopped."); + return; + } + + gf_log (GF_IO_THREADS, GF_LOG_INFO, "Stopping reinit_ns_conf_thread."); + + if (pthread_cancel (priv->reinit_ns_conf_thread) != 0) { + gf_log (this->name, GF_LOG_WARNING, + 
"pthread_cancel(iot_reinit_ns_conf_thread) failed"); + } + + if (pthread_join (priv->reinit_ns_conf_thread, NULL) != 0) { + gf_log (this->name, GF_LOG_WARNING, + "pthread_join(iot_reinit_ns_conf_thread) failed"); + } + + /* Failure probably means it's already dead. */ + priv->reinit_ns_conf_thread_running = _gf_false; +} + call_stub_t * -__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep) +__iot_dequeue (iot_conf_t *conf, int *pri) { - call_stub_t *stub = NULL; - int i = 0; - struct timeval curtv = {0,}, difftv = {0,}; + call_stub_t *stub = NULL; + iot_ns_clock_t *curr_clock; + iot_ns_queue_t *queue = NULL; + int i, start_idx; *pri = -1; - sleep->tv_sec = 0; - sleep->tv_nsec = 0; + for (i = 0; i < IOT_PRI_MAX; i++) { - if (list_empty (&conf->reqs[i]) || - (conf->ac_iot_count[i] >= conf->ac_iot_limit[i])) + if (conf->ac_iot_count[i] >= conf->ac_iot_limit[i] || + conf->queue_sizes[i] == 0) { + /* If we have too many workers currently serving this + * priority level, or no reqs at this level, skip. */ continue; + } - if (i == IOT_PRI_LEAST) { - pthread_mutex_lock(&conf->throttle.lock); - if (!conf->throttle.sample_time.tv_sec) { - /* initialize */ - gettimeofday(&conf->throttle.sample_time, NULL); - } else { - /* - * Maintain a running count of least priority - * operations that are handled over a particular - * time interval. The count is provided via - * state dump and is used as a measure against - * least priority op throttling. - */ - gettimeofday(&curtv, NULL); - timersub(&curtv, &conf->throttle.sample_time, - &difftv); - if (difftv.tv_sec >= IOT_LEAST_THROTTLE_DELAY) { - conf->throttle.cached_rate = - conf->throttle.sample_cnt; - conf->throttle.sample_cnt = 0; - conf->throttle.sample_time = curtv; - } - - /* - * If we're over the configured rate limit, - * provide an absolute time to the caller that - * represents the soonest we're allowed to - * return another least priority request. 
- */ - if (conf->throttle.rate_limit && - conf->throttle.sample_cnt >= - conf->throttle.rate_limit) { - struct timeval delay; - delay.tv_sec = IOT_LEAST_THROTTLE_DELAY; - delay.tv_usec = 0; - - timeradd(&conf->throttle.sample_time, - &delay, &curtv); - TIMEVAL_TO_TIMESPEC(&curtv, sleep); - - pthread_mutex_unlock( - &conf->throttle.lock); - break; - } - } - conf->throttle.sample_cnt++; - pthread_mutex_unlock(&conf->throttle.lock); - } - - stub = list_entry (conf->reqs[i].next, call_stub_t, list); + if (conf->ns_weighted_queueing) { + /* Get the clock for this priority level, and keep track + * of where the search started, so we know if we've + * searched through the whole clock. */ + curr_clock = &conf->ns_clocks[i]; + start_idx = curr_clock->idx; + + do { + /* Get the queue for the current index (modulo + * size), and increment the clock forward. */ + queue = curr_clock->slots[curr_clock->idx]; + curr_clock->idx = (curr_clock->idx + 1) % curr_clock->size; + + /* If we have a request to serve, then we're done. */ + if (!__iot_ns_queue_empty (queue)) { + break; + } + + /* Otherwise, keep searching until we've looped + * back to the start. */ + queue = NULL; + } while (curr_clock->idx != start_idx); + + /* If we have no queue, we must've not found a req to + * serve. Let's try the next priority. */ + if (!queue) { + continue; + } + } else { + /* If our unknown queue is empty, we have no other + * queues to serve. */ + if (__iot_ns_queue_empty (&conf->ns_unknown_queue[i])) { + continue; + } + + /* Select the unknown queue as the next queue to + * serve a request from. */ + queue = &conf->ns_unknown_queue[i]; + } + + /* Otherwise take a request off of the queue. */ + stub = list_first_entry (&queue->reqs, call_stub_t, list); + list_del_init (&stub->list); + + /* Increment the number of workers serving this priority, + * and record which priority we are serving. Update queue + * sizes and set `pri` variable for the caller. 
*/ conf->ac_iot_count[i]++; conf->queue_marked[i] = _gf_false; + + conf->queue_size--; + conf->queue_sizes[i]--; + queue->size--; + *pri = i; break; } - if (!stub) - return NULL; - - conf->queue_size--; - conf->queue_sizes[*pri]--; - list_del_init (&stub->list); - return stub; } @@ -135,15 +680,29 @@ __iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep) void __iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri) { - if (pri < 0 || pri >= IOT_PRI_MAX) - pri = IOT_PRI_MAX-1; + ns_info_t *info = &stub->frame->root->ns_info; + iot_ns_queue_t *queue = 0; + + /* If we have an invalid priority, set it to LEAST. */ + if (pri < 0 || pri >= IOT_PRI_MAX) { + pri = IOT_PRI_MAX - 1; + } - list_add_tail (&stub->list, &conf->reqs[pri]); + /* Get the queue for this namespace. If we don't have one, + * use the unknown queue that always exists in the conf struct. */ + queue = __iot_get_ns_queue (conf, info); + if (!queue) { + queue = conf->ns_unknown_queue; + } + + /* Get the (pri)'th level queue, and add the request to the queue. */ + queue = &queue[pri]; + list_add_tail (&stub->list, &queue->reqs); + /* Update size records. 
*/ conf->queue_size++; conf->queue_sizes[pri]++; - - return; + queue->size++; } @@ -156,28 +715,24 @@ iot_worker (void *data) struct timespec sleep_till = {0, }; int ret = 0; int pri = -1; - struct timespec sleep = {0,}; - gf_boolean_t bye = _gf_false; + char timeout = 0; + char bye = 0; conf = data; this = conf->this; THIS = this; + gf_log (GF_IO_THREADS, GF_LOG_DEBUG, "IOT worker spawned."); - for (;;) { + while (_gf_true) { pthread_mutex_lock (&conf->mutex); { if (pri != -1) { conf->ac_iot_count[pri]--; pri = -1; } - while (conf->queue_size == 0) { - if (conf->down) { - bye = _gf_true;/*Avoid sleep*/ - break; - } - clock_gettime (CLOCK_REALTIME_COARSE, - &sleep_till); + while (conf->queue_size == 0) { + clock_gettime (CLOCK_REALTIME_COARSE, &sleep_till); sleep_till.tv_sec += conf->idle_time; conf->sleep_count++; @@ -186,55 +741,52 @@ iot_worker (void *data) &sleep_till); conf->sleep_count--; - if (conf->down || ret == ETIMEDOUT) { - bye = _gf_true; + if (ret == ETIMEDOUT) { + timeout = 1; break; } } - if (bye) { - if (conf->down || - conf->curr_count > IOT_MIN_THREADS) { + if (timeout) { + if (conf->curr_count > IOT_MIN_THREADS) { conf->curr_count--; - if (conf->curr_count == 0) - pthread_cond_broadcast (&conf->cond); - gf_msg_debug (conf->this->name, 0, - "terminated. " - "conf->curr_count=%d", - conf->curr_count); + bye = 1; + gf_log (conf->this->name, GF_LOG_DEBUG, + "timeout, terminated. 
conf->curr_count=%d", + conf->curr_count); } else { - bye = _gf_false; + timeout = 0; } } - if (!bye) { - stub = __iot_dequeue (conf, &pri, &sleep); - if (!stub && (sleep.tv_sec || sleep.tv_nsec)) { - pthread_cond_timedwait(&conf->cond, - &conf->mutex, - &sleep); - pthread_mutex_unlock(&conf->mutex); - continue; - } - } + stub = __iot_dequeue (conf, &pri); } pthread_mutex_unlock (&conf->mutex); if (stub) { /* guard against spurious wakeups */ if (stub->poison) { gf_log (this->name, GF_LOG_INFO, - "Dropping poisoned request %p.", stub); + "dropping poisoned request %p", stub); call_stub_destroy (stub); } else { call_resume (stub); } } - stub = NULL; - if (bye) + if (bye) { break; + } + } + + if (pri != -1) { + pthread_mutex_lock (&conf->mutex); + { + conf->ac_iot_count[pri]--; + } + pthread_mutex_unlock (&conf->mutex); } + gf_log (GF_IO_THREADS, GF_LOG_DEBUG, "IOT worker died."); return NULL; } @@ -273,6 +825,9 @@ iot_get_pri_meaning (iot_pri_t pri) { char *name = NULL; switch (pri) { + case IOT_PRI_UNSPEC: + name = "unspecified"; + break; case IOT_PRI_HI: name = "fast"; break; @@ -288,13 +843,11 @@ iot_get_pri_meaning (iot_pri_t pri) case IOT_PRI_MAX: name = "invalid"; break; - case IOT_PRI_UNSPEC: - name = "unspecified"; - break; } return name; } + int iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) { @@ -307,75 +860,17 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) goto out; } - switch (stub->fop) { - case GF_FOP_OPEN: - case GF_FOP_STAT: - case GF_FOP_FSTAT: - case GF_FOP_LOOKUP: - case GF_FOP_ACCESS: - case GF_FOP_READLINK: - case GF_FOP_OPENDIR: - case GF_FOP_STATFS: - case GF_FOP_READDIR: - case GF_FOP_READDIRP: - case GF_FOP_GETACTIVELK: - case GF_FOP_SETACTIVELK: - pri = IOT_PRI_HI; - break; - - case GF_FOP_CREATE: - case GF_FOP_FLUSH: - case GF_FOP_LK: - case GF_FOP_INODELK: - case GF_FOP_FINODELK: - case GF_FOP_ENTRYLK: - case GF_FOP_FENTRYLK: - case GF_FOP_LEASE: - case GF_FOP_UNLINK: - case GF_FOP_SETATTR: 
- case GF_FOP_FSETATTR: - case GF_FOP_MKNOD: - case GF_FOP_MKDIR: - case GF_FOP_RMDIR: - case GF_FOP_SYMLINK: - case GF_FOP_RENAME: - case GF_FOP_LINK: - case GF_FOP_SETXATTR: - case GF_FOP_GETXATTR: - case GF_FOP_FGETXATTR: - case GF_FOP_FSETXATTR: - case GF_FOP_REMOVEXATTR: - case GF_FOP_FREMOVEXATTR: - pri = IOT_PRI_NORMAL; - break; + if (frame->pri != IOT_PRI_UNSPEC) { + pri = frame->pri; + goto out; + } - case GF_FOP_READ: - case GF_FOP_WRITE: - case GF_FOP_FSYNC: - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - case GF_FOP_FSYNCDIR: - case GF_FOP_XATTROP: - case GF_FOP_FXATTROP: - case GF_FOP_RCHECKSUM: - case GF_FOP_FALLOCATE: - case GF_FOP_DISCARD: - case GF_FOP_ZEROFILL: - pri = IOT_PRI_LO; - break; + // Retrieve the fop priority + pri = iot_fop_to_pri (stub->fop); - case GF_FOP_FORGET: - case GF_FOP_RELEASE: - case GF_FOP_RELEASEDIR: - case GF_FOP_GETSPEC: - break; - case GF_FOP_IPC: - default: - return -EINVAL; - } out: - gf_msg_debug (this->name, 0, "%s scheduled as %s fop", - gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); + gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop", + gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); ret = do_iot_schedule (this->private, stub, pri); return ret; } @@ -619,34 +1114,115 @@ iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, } +/* Populate all queue size keys for a specific priority level. */ int -iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) +__iot_populate_ns_queue_sizes (iot_conf_t *conf, dict_t *depths, int pri) +{ + int ret = 0; + data_pair_t *curr = NULL; + iot_ns_queue_t *curr_queue = NULL; + char *temp_key = NULL; + char *ns_name = NULL; + const char *pri_str = fop_pri_to_string (pri);; + + /* For each namespace at this priority level, we try to grab the n + * namespace name and record a key like `namespaces.NAME.PRI_LEVEL`. 
*/ + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[pri]; + + /* Try to retrieve the namespace's un-hashed real name. */ + ret = dict_get_str (conf->hash_to_ns, curr->key, &ns_name); + if (ret || !ns_name) { + continue; + } + + /* Give the root namespace a readable name. */ + if (strncmp (ns_name, "/", 1) == 0) { + ns_name = "root"; + } + + /* Print a new temporary key for the namespace and priority level. */ + ret = gf_asprintf (&temp_key, "namespaces.%s.%s", ns_name, pri_str); + if (ret == -1 || !temp_key) { + ret = -(ENOMEM); + goto out; + } + + /* Insert the key and queue-size. */ + ret = dict_set_int32 (depths, temp_key, curr_queue->size); + GF_FREE (temp_key); + temp_key = NULL; + + if (ret) { + goto out; + } + } + +out: + return ret; +} + +/* Populate global queue counts (per-priority) and if namespace queueing is + * enabled, then also add those keys to the dictionary as well. */ +int +__iot_populate_queue_sizes (iot_conf_t *conf, dict_t **depths) { - iot_conf_t *conf = NULL; - dict_t *depths = NULL; - int i = 0; + int ret = 0, pri = 0; + const char *pri_str = NULL; - conf = this->private; + /* We explicitly do not want a reference count for this dict + * in this translator, since it will get freed in io_stats later. */ + *depths = get_new_dict (); + if (!*depths) { + return -(ENOMEM); + } - if (conf && name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { - // We explicitly do not want a reference count - // for this dict in this translator - depths = get_new_dict (); - if (!depths) - goto unwind_special_getxattr; + for (pri = 0; pri < IOT_PRI_MAX; pri++) { + pri_str = fop_pri_to_string (pri); - for (i = 0; i < IOT_PRI_MAX; i++) { - if (dict_set_int32 (depths, - (char *)fop_pri_to_string (i), - conf->queue_sizes[i]) != 0) { - dict_destroy (depths); - depths = NULL; - goto unwind_special_getxattr; + /* First, let's add an entry for the number of requests + * per prority (globally). 
*/ + ret = dict_set_int32 (*depths, (char *)pri_str, + conf->queue_sizes[pri]); + if (ret) { + goto out; + } + + /* If namespace queueing is enabled, then try to populate + * per-namespace queue keys as well. */ + if (conf->ns_weighted_queueing) { + ret = __iot_populate_ns_queue_sizes (conf, *depths, pri); + if (ret) { + goto out; } } + } + +out: + if (ret) { + dict_destroy (*depths); + *depths = NULL; + } + + return ret; +} + +int +iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + iot_conf_t *conf = this->private; + dict_t *depths = NULL; + + if (name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { + /* Populate depths dict, or it'll stay NULL if an error occurred. */ + pthread_mutex_lock (&conf->mutex); + { + __iot_populate_queue_sizes (conf, &depths); + } + pthread_mutex_unlock (&conf->mutex); -unwind_special_getxattr: STACK_UNWIND_STRICT (getxattr, frame, 0, 0, depths, xdata); return 0; } @@ -824,10 +1400,9 @@ __iot_workers_scale (iot_conf_t *conf) ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf); if (ret == 0) { conf->curr_count++; - gf_msg_debug (conf->this->name, 0, - "scaled threads to %d (queue_size=%d/%d)", - conf->curr_count, - conf->queue_size, scale); + gf_log (conf->this->name, GF_LOG_DEBUG, + "scaled threads to %d (queue_size=%d/%d)", + conf->curr_count, conf->queue_size, scale); } else { break; } @@ -872,13 +1447,11 @@ set_stack_size (iot_conf_t *conf) if (err == EINVAL) { err = pthread_attr_getstacksize (&conf->w_attr, &stacksize); if (!err) - gf_msg (this->name, GF_LOG_WARNING, - 0, IO_THREADS_MSG_SIZE_NOT_SET, + gf_log (this->name, GF_LOG_WARNING, "Using default thread stack size %zd", stacksize); else - gf_msg (this->name, GF_LOG_WARNING, - 0, IO_THREADS_MSG_SIZE_NOT_SET, + gf_log (this->name, GF_LOG_WARNING, "Using default thread stack size"); } @@ -897,9 +1470,8 @@ mem_acct_init (xlator_t *this) ret = xlator_mem_acct_init (this, gf_iot_mt_end + 1); if (ret != 0) { - 
gf_msg (this->name, GF_LOG_ERROR, - ENOMEM, IO_THREADS_MSG_NO_MEMORY, - "Memory accounting init failed"); + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); return ret; } @@ -938,10 +1510,6 @@ iot_priv_dump (xlator_t *this) gf_proc_dump_write("least_priority_threads", "%d", conf->ac_iot_limit[IOT_PRI_LEAST]); - gf_proc_dump_write("cached least rate", "%u", - conf->throttle.cached_rate); - gf_proc_dump_write("least rate limit", "%u", conf->throttle.rate_limit); - return 0; } @@ -1107,13 +1675,16 @@ stop_iot_watchdog (xlator_t *this) int reconfigure (xlator_t *this, dict_t *options) { - iot_conf_t *conf = NULL; - int ret = -1; + iot_conf_t *conf = NULL; + int i, ret = -1; + gf_boolean_t ns_weighted_queueing = _gf_false; conf = this->private; if (!conf) goto out; + GF_OPTION_RECONF ("iam-brick-daemon", conf->iambrickd, options, bool, out); + GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio, @@ -1132,25 +1703,52 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("least-prio-threads", conf->ac_iot_limit[IOT_PRI_LEAST], options, int32, out); + GF_OPTION_RECONF ("enable-least-priority", conf->least_priority, options, bool, out); - GF_OPTION_RECONF ("least-rate-limit", conf->throttle.rate_limit, - options, int32, out); - GF_OPTION_RECONF ("cleanup-disconnected-reqs", conf->cleanup_disconnected_reqs, options, bool, out); GF_OPTION_RECONF ("watchdog-secs", conf->watchdog_secs, options, int32, out); - if (conf->watchdog_secs > 0) { start_iot_watchdog (this); } else { stop_iot_watchdog (this); } - ret = 0; + GF_OPTION_RECONF ("ns-weighted-queueing", ns_weighted_queueing, options, bool, out); + GF_OPTION_RECONF ("ns-default-weight", conf->ns_default_weight, options, uint32, out); + GF_OPTION_RECONF ("ns-weight-tolerance", conf->ns_weight_tolerance, options, double, out); + GF_OPTION_RECONF ("ns-conf-reinit-secs", conf->ns_conf_reinit_secs, 
options, + uint32, out); + + if (!conf->iambrickd) { + ns_weighted_queueing = _gf_false; + } + + /* Reinit the default weight, which is the weight of the unknown queue. */ + for (i = 0; i < IOT_PRI_MAX; i++) { + conf->ns_unknown_queue[i].weight = conf->ns_default_weight; + } + + if (ns_weighted_queueing != conf->ns_weighted_queueing) { + pthread_mutex_lock (&conf->mutex); + { + conf->ns_weighted_queueing = ns_weighted_queueing; + __iot_reinit_ns_conf (conf); + } + pthread_mutex_unlock (&conf->mutex); + } + + if (conf->ns_weighted_queueing) { + start_iot_reinit_ns_conf_thread (this); + } else { + stop_iot_reinit_ns_conf_thread (this); + } + + ret = 0; out: return ret; } @@ -1160,49 +1758,43 @@ int init (xlator_t *this) { iot_conf_t *conf = NULL; - int ret = -1; - int i = 0; + int i, ret = -1; if (!this->children || this->children->next) { - gf_msg ("io-threads", GF_LOG_ERROR, 0, - IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, - "FATAL: iot not configured " - "with exactly one child"); + gf_log ("io-threads", GF_LOG_ERROR, + "FATAL: iot not configured with exactly one child"); goto out; } if (!this->parents) { - gf_msg (this->name, GF_LOG_WARNING, 0, - IO_THREADS_MSG_VOL_MISCONFIGURED, - "dangling volume. check volfile "); + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); } conf = (void *) GF_CALLOC (1, sizeof (*conf), gf_iot_mt_iot_conf_t); if (conf == NULL) { - gf_msg (this->name, GF_LOG_ERROR, ENOMEM, - IO_THREADS_MSG_NO_MEMORY, "out of memory"); + gf_log (this->name, GF_LOG_ERROR, + "out of memory"); goto out; } - if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - IO_THREADS_MSG_INIT_FAILED, + if ((ret = pthread_cond_init (&conf->cond, NULL)) != 0) { + gf_log (this->name, GF_LOG_ERROR, "pthread_cond_init failed (%d)", ret); goto out; } - conf->cond_inited = _gf_true; - if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - IO_THREADS_MSG_INIT_FAILED, + if ((ret = pthread_mutex_init (&conf->mutex, NULL)) != 0) { + gf_log (this->name, GF_LOG_ERROR, "pthread_mutex_init failed (%d)", ret); goto out; } - conf->mutex_inited = _gf_true; set_stack_size (conf); + GF_OPTION_INIT ("iam-brick-daemon", conf->iambrickd, bool, out); + GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio, @@ -1228,33 +1820,45 @@ init (xlator_t *this) GF_OPTION_INIT ("cleanup-disconnected-reqs", conf->cleanup_disconnected_reqs, bool, out); - GF_OPTION_INIT ("least-rate-limit", conf->throttle.rate_limit, int32, - out); + conf->this = this; + + GF_OPTION_INIT ("ns-weighted-queueing", conf->ns_weighted_queueing, bool, out); + GF_OPTION_INIT ("ns-default-weight", conf->ns_default_weight, uint32, out); + GF_OPTION_INIT ("ns-weight-tolerance", conf->ns_weight_tolerance, double, out); + GF_OPTION_INIT ("ns-conf-reinit-secs", conf->ns_conf_reinit_secs, uint32, out); - if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - IO_THREADS_MSG_INIT_FAILED, - "pthread_mutex_init failed (%d)", ret); - goto out; + for (i = 0; i < IOT_PRI_MAX; i++) { + INIT_LIST_HEAD (&conf->ns_unknown_queue[i].reqs); + conf->ns_unknown_queue[i].hash = 0; + 
conf->ns_unknown_queue[i].weight = conf->ns_default_weight; + conf->ns_unknown_queue[i].size = 0; } - conf->this = this; + if (!conf->iambrickd) { + conf->ns_weighted_queueing = _gf_false; + } - for (i = 0; i < IOT_PRI_MAX; i++) { - INIT_LIST_HEAD (&conf->reqs[i]); + conf->hash_to_ns = dict_new (); + + pthread_mutex_lock (&conf->mutex); + { + __iot_reinit_ns_conf (conf); } + pthread_mutex_unlock (&conf->mutex); ret = iot_workers_scale (conf); - if (ret == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - IO_THREADS_MSG_INIT_FAILED, + gf_log (this->name, GF_LOG_ERROR, "cannot initialize worker threads, exiting init"); goto out; } this->private = conf; + if (conf->ns_weighted_queueing) { + start_iot_reinit_ns_conf_thread (this); + } + conf->watchdog_secs = 0; GF_OPTION_INIT ("watchdog-secs", conf->watchdog_secs, int32, out); if (conf->watchdog_secs > 0) { @@ -1263,57 +1867,24 @@ init (xlator_t *this) ret = 0; out: - if (ret) + if (ret) { GF_FREE (conf); + } return ret; } -static void -iot_exit_threads (iot_conf_t *conf) -{ - pthread_mutex_lock (&conf->mutex); - { - conf->down = _gf_true; - /*Let all the threads know that xl is going down*/ - pthread_cond_broadcast (&conf->cond); - while (conf->curr_count)/*Wait for threads to exit*/ - pthread_cond_wait (&conf->cond, &conf->mutex); - } - pthread_mutex_unlock (&conf->mutex); -} - -int -notify (xlator_t *this, int32_t event, void *data, ...) 
-{ - iot_conf_t *conf = this->private; - - if (GF_EVENT_PARENT_DOWN == event) - iot_exit_threads (conf); - - default_notify (this, event, data); - - return 0; -} void fini (xlator_t *this) { iot_conf_t *conf = this->private; - if (!conf) - return; - - if (conf->mutex_inited && conf->cond_inited) - iot_exit_threads (conf); - - if (conf->cond_inited) - pthread_cond_destroy (&conf->cond); - - if (conf->mutex_inited) - pthread_mutex_destroy (&conf->mutex); - stop_iot_watchdog (this); + stop_iot_reinit_ns_conf_thread (this); + + dict_unref (conf->hash_to_ns); + conf->hash_to_ns = NULL; GF_FREE (conf); @@ -1321,13 +1892,32 @@ fini (xlator_t *this) return; } +/* Clears a queue of requests from the given client (which is assumed to + * have disconnected or otherwise stopped needing these requests...) */ +void +clear_reqs_from_queue (xlator_t *this, client_t *client, struct list_head *queue) +{ + call_stub_t *curr; + call_stub_t *next; + + list_for_each_entry_safe (curr, next, queue, list) { + if (curr->frame->root->client != client) { + continue; + } + gf_log (this->name, GF_LOG_INFO, + "poisoning %s fop at %p for client %s", + gf_fop_list[curr->fop], curr, client->client_uid); + curr->poison = _gf_true; + } +} + static int iot_disconnect_cbk (xlator_t *this, client_t *client) { + iot_conf_t *conf = this->private; + data_pair_t *curr = NULL; + iot_ns_queue_t *curr_queue = NULL; int i; - call_stub_t *curr; - call_stub_t *next; - iot_conf_t *conf = this->private; if (!conf || !conf->cleanup_disconnected_reqs) { goto out; @@ -1335,15 +1925,15 @@ iot_disconnect_cbk (xlator_t *this, client_t *client) pthread_mutex_lock (&conf->mutex); for (i = 0; i < IOT_PRI_MAX; i++) { - list_for_each_entry_safe (curr, next, &conf->reqs[i], list) { - if (curr->frame->root->client != client) { - continue; + /* Clear client reqs from the unknown queue. */ + clear_reqs_from_queue (this, client, &conf->ns_unknown_queue[i].reqs); + /* Clear client reqs from each of the namespace queues. 
*/ + if (conf->ns_weighted_queueing && conf->ns_queues) { + dict_foreach_inline (conf->ns_queues, curr) { + curr_queue = data_to_ptr (curr->value); + curr_queue = &curr_queue[i]; + clear_reqs_from_queue (this, client, &curr_queue->reqs); } - gf_log (this->name, GF_LOG_INFO, - "poisoning %s fop at %p for client %s", - gf_fop_list[curr->fop], curr, - client->client_uid); - curr->poison = _gf_true; } } pthread_mutex_unlock (&conf->mutex); @@ -1412,7 +2002,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, .max = IOT_MAX_THREADS, - .default_value = "16", + .default_value = "32", .description = "Number of threads in IO threads translator which " "perform concurrent IO operations" @@ -1435,7 +2025,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, .max = IOT_MAX_THREADS, - .default_value = "16", + .default_value = "32", .description = "Max number of threads in IO threads translator which " "perform high priority IO operations at a given time" @@ -1444,7 +2034,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, .max = IOT_MAX_THREADS, - .default_value = "16", + .default_value = "32", .description = "Max number of threads in IO threads translator which " "perform normal priority IO operations at a given time" @@ -1453,7 +2043,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, .max = IOT_MAX_THREADS, - .default_value = "16", + .default_value = "32", .description = "Max number of threads in IO threads translator which " "perform low priority IO operations at a given time" @@ -1477,14 +2067,6 @@ struct volume_options options[] = { .max = 0x7fffffff, .default_value = "120", }, - { .key = {"least-rate-limit"}, - .type = GF_OPTION_TYPE_INT, - .min = 0, - .max = INT_MAX, - .default_value = "0", - .description = "Max number of least priority operations to handle " - "per-second" - }, { .key = {"watchdog-secs"}, .type 
= GF_OPTION_TYPE_INT, .min = 0, @@ -1495,7 +2077,47 @@ struct volume_options options[] = { { .key = {"cleanup-disconnected-reqs"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", - .description = "'Poison' queued requests when a client disconnects" + .description = "Enable/Disable least priority" + }, + { .key = {"ns-weighted-queueing"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enables the namespace queues clock." + }, + { .key = {"ns-default-weight"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 0x7fffffff, + .default_value = "100", + .description = "The default weight of a queue which doesn't have a " + "weight specified in the namespace conf. This is also " + "the weight of the unknown (default) queue." + }, + { .key = {"ns-weight-tolerance"}, + .type = GF_OPTION_TYPE_DOUBLE, + .default_value = "0.5", + .description = "The tolerance in percentage (out of 100) that the " + "slot-allocation algorithm will tolerate for weight/total " + "and slots/total percentages. This corresponds to " + "ideal and realized workload percentages allocated " + "to the namespace." + }, + { .key = {"ns-conf-reinit-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 0x7fffffff, + .default_value = "5", + .description = "Number of seconds that the ns conf reinit thread " + "sleeps before trying to detect changes in the " + "namespace config file." + }, + { + .key = {"iam-brick-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the io-stats " + "translator is running as part of brick daemon " + "or not." 
}, { .key = {NULL}, }, diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index 4300cf673b2..5c3403a7153 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -11,6 +11,11 @@ #ifndef __IOT_H #define __IOT_H +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + #include "compat-errno.h" #include "glusterfs.h" @@ -25,6 +30,8 @@ #include <semaphore.h> #include "statedump.h" +#define GF_IO_THREADS "io-threads" +const char *_IOT_NAMESPACE_CONF = TOSTRING (GLUSTERD_WORKDIR) "/namespaces.conf"; struct iot_conf; @@ -38,23 +45,33 @@ struct iot_conf; #define IOT_MIN_FOP_PER_THREAD 0 #define IOT_MAX_FOP_PER_THREAD 2000 - -#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) - - -#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */ -struct iot_least_throttle { - struct timeval sample_time; /* timestamp of current sample */ - uint32_t sample_cnt; /* sample count for active interval */ - uint32_t cached_rate; /* the most recently measured rate */ - int32_t rate_limit; /* user-specified rate limit */ - pthread_mutex_t lock; -}; +/* A queue (well, typically a set of IOT_PRI_MAX queues) that corresponds to + * a namespace and a priority level. */ +typedef struct _iot_ns_queue { + uint32_t hash; /* Hash of namespace queue corresponds to. */ + uint32_t weight; /* Weight this queue should have in the clock. */ + double percentage; /* Percentage of total weight given to this queue. */ + uint32_t slots; /* Temp variable for number of slots to allocate to this queue. */ + uint32_t size; /* Number of reqs in the queue currently. */ + struct list_head reqs; /* Queue of fop call-stubs (requests) */ +} iot_ns_queue_t; + +/* A circular buffer which points to multiple queues which approximate a + * weighted round robin serving requests. */ +typedef struct _iot_ns_clock { + unsigned int idx; /* Current idx of position on clock. 
*/ + size_t size; /* Size of clock. */ + iot_ns_queue_t **slots; /* Circular buffer of clock slots. */ +} iot_ns_clock_t; + +#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024*8)) struct iot_conf { pthread_mutex_t mutex; pthread_cond_t cond; + gf_boolean_t iambrickd; + int32_t max_count; /* configured maximum */ int32_t fops_per_thread_ratio; int32_t curr_count; /* actual number of threads running */ @@ -62,30 +79,39 @@ struct iot_conf { int32_t idle_time; /* in seconds */ - struct list_head reqs[IOT_PRI_MAX]; + gf_boolean_t ns_weighted_queueing; + uint32_t ns_default_weight; + double ns_weight_tolerance; + uint32_t ns_conf_reinit_secs; + time_t ns_conf_mtime; + dict_t *ns_queues; /* Queue for requsts for namespaces that were parsed. */ + dict_t *hash_to_ns; /* Hash key (string) -> Namespace string */ + iot_ns_clock_t ns_clocks[IOT_PRI_MAX]; /* Weighted Round Robin clocks (per priority). */ + iot_ns_queue_t ns_unknown_queue[IOT_PRI_MAX]; /* Queue for untagged requests (per priority). */ + + gf_boolean_t reinit_ns_conf_thread_running; + pthread_t reinit_ns_conf_thread; int32_t ac_iot_limit[IOT_PRI_MAX]; int32_t ac_iot_count[IOT_PRI_MAX]; int queue_sizes[IOT_PRI_MAX]; int queue_size; pthread_attr_t w_attr; - gf_boolean_t least_priority; /*Enable/Disable least-priority */ + gf_boolean_t least_priority; /* Enable/Disable least-priority */ - xlator_t *this; + xlator_t *this; size_t stack_size; - gf_boolean_t down; /*PARENT_DOWN event is notified*/ - gf_boolean_t mutex_inited; - gf_boolean_t cond_inited; - struct iot_least_throttle throttle; - int32_t watchdog_secs; gf_boolean_t watchdog_running; pthread_t watchdog_thread; gf_boolean_t queue_marked[IOT_PRI_MAX]; + gf_boolean_t cleanup_disconnected_reqs; }; typedef struct iot_conf iot_conf_t; +iot_pri_t iot_fop_to_pri (glusterfs_fop_t fop); + #endif /* __IOT_H */ |