summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@fb.com>2017-09-15 06:59:01 -0700
committerJeff Darcy <jdarcy@fb.com>2017-09-15 13:47:01 -0700
commit8dfdecf220d1c9365e1f8d6af9ead5e48c61e2eb (patch)
treebccd5906be43cf81792248b06099006525ed0c27
parente4b47b5d54644c398c424a99116a0cc37e4431d6 (diff)
Replace namespace/io-stats/io-threads with 3.6-fb versions
This rolls up multiple patches related to namespace identification and throttling/QoS. This primarily includes the following, all by Michael Goulet <mgoulet@fb.com>. io-threads: Add weighted round robin queueing by namespace https://phabricator.facebook.com/D5615269 io-threads: Add per-namespaces queue sizes to IO_THREADS_QUEUE_SIZE_KEY https://phabricator.facebook.com/D5683162 io-threads: Implement better slot allocation algorithm https://phabricator.facebook.com/D5683186 io-threads: Only enable weighted queueing on bricks https://phabricator.facebook.com/D5700062 io-threads: Update queue sizes on drain https://phabricator.facebook.com/D5704832 Fix parsing (-1) as default NS weight https://phabricator.facebook.com/D5723383 Parts of the following patches have also been applied to satisfy dependencies. io-throttling: Calculate moving averages and throttle offending hosts https://phabricator.fb.com/D2516161 Shreyas Siravara <sshreyas@fb.com> Hook up ODS logging for FUSE clients. https://phabricator.facebook.com/D3963376 Kevin Vigor <kvigor@fb.com> Add the flag --skip-nfsd-start to skip the NFS daemon starting, even if it is enabled https://phabricator.facebook.com/D4575368 Alex Lorca <alexlorca@fb.com> There are also some "standard" changes: dealing with code that moved, reindenting to comply with Gluster coding standards, gf_uuid_xxx, etc. This patch *does* revert some changes which have occurred upstream since 3.6; these will be re-applied as appropriate on top of this new base. Change-Id: I69024115da7a60811e5b86beae781d602bdb558d Signed-off-by: Jeff Darcy <jdarcy@fb.com>
-rw-r--r--glusterfsd/src/glusterfsd.c8
-rw-r--r--glusterfsd/src/glusterfsd.h1
-rw-r--r--libglusterfs/src/common-utils.c28
-rw-r--r--libglusterfs/src/dict.c42
-rw-r--r--libglusterfs/src/dict.h16
-rw-r--r--libglusterfs/src/glusterfs.h11
-rw-r--r--libglusterfs/src/stack.c1
-rw-r--r--libglusterfs/src/stack.h15
-rw-r--r--tests/basic/afr/durability-off.t12
-rw-r--r--tests/basic/fop-sampling.t4
-rw-r--r--tests/basic/stats-dump.t4
-rw-r--r--xlators/debug/io-stats/src/Makefile.am6
-rw-r--r--xlators/debug/io-stats/src/io-stats-mem-types.h2
-rw-r--r--xlators/debug/io-stats/src/io-stats.c2487
-rw-r--r--xlators/debug/io-stats/src/io-stats.h381
-rw-r--r--xlators/features/namespace/src/namespace.c77
-rw-r--r--xlators/features/namespace/src/namespace.h1
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c2
-rw-r--r--xlators/performance/io-threads/src/Makefile.am9
-rw-r--r--xlators/performance/io-threads/src/io-threads.c1246
-rw-r--r--xlators/performance/io-threads/src/io-threads.h66
21 files changed, 3057 insertions, 1362 deletions
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index 5022cfc22da..a0fec8f49a1 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -129,6 +129,10 @@ static struct argp_option gf_options[] = {
"buffer size, [default: 5]"},
{"log-flush-timeout", ARGP_LOG_FLUSH_TIMEOUT, "LOG-FLUSH-TIMEOUT", 0,
"Set log flush timeout, [default: 2 minutes]"},
+ {"stats-instance-name", ARGP_STATS_INSTANCE_NAME,
+ "STATS-INSTANCE-NAME", 0,
+ "Specify instance name for io-stats translator"},
+
{0, 0, 0, 0, "Advanced Options:"},
{"volfile-server-port", ARGP_VOLFILE_SERVER_PORT_KEY, "PORT", 0,
@@ -1246,6 +1250,10 @@ no_oom_api:
break;
+ case ARGP_STATS_INSTANCE_NAME:
+ cmd_args->stats_instance_name = gf_strdup (arg);
+ break;
+
case ARGP_LOG_FLUSH_TIMEOUT:
if (gf_string2uint32 (arg, &cmd_args->log_flush_timeout)) {
argp_failure (state, -1, 0,
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
index b5c6b27b534..940b351e552 100644
--- a/glusterfsd/src/glusterfsd.h
+++ b/glusterfsd/src/glusterfsd.h
@@ -96,6 +96,7 @@ enum argp_option_keys {
#ifdef GF_LINUX_HOST_OS
ARGP_OOM_SCORE_ADJ_KEY = 176,
#endif
+ ARGP_STATS_INSTANCE_NAME = 177,
};
struct _gfd_vol_top_priv_t {
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 84a0785d660..62c41a767d0 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -4349,9 +4349,11 @@ list_node_del (struct list_node *node)
GF_FREE (node);
}
-const char *
-fop_enum_to_pri_string (glusterfs_fop_t fop)
+iot_pri_t
+iot_fop_to_pri (glusterfs_fop_t fop)
{
+ iot_pri_t pri = IOT_PRI_MAX - 1;
+
switch (fop) {
case GF_FOP_OPEN:
case GF_FOP_STAT:
@@ -4365,7 +4367,8 @@ fop_enum_to_pri_string (glusterfs_fop_t fop)
case GF_FOP_READDIRP:
case GF_FOP_GETACTIVELK:
case GF_FOP_SETACTIVELK:
- return "HIGH";
+ pri = IOT_PRI_HI;
+ break;
case GF_FOP_CREATE:
case GF_FOP_FLUSH:
@@ -4391,7 +4394,8 @@ fop_enum_to_pri_string (glusterfs_fop_t fop)
case GF_FOP_FREMOVEXATTR:
case GF_FOP_IPC:
case GF_FOP_LEASE:
- return "NORMAL";
+ pri = IOT_PRI_NORMAL;
+ break;
case GF_FOP_READ:
case GF_FOP_WRITE:
@@ -4402,22 +4406,26 @@ fop_enum_to_pri_string (glusterfs_fop_t fop)
case GF_FOP_XATTROP:
case GF_FOP_FXATTROP:
case GF_FOP_RCHECKSUM:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
case GF_FOP_ZEROFILL:
- case GF_FOP_FALLOCATE:
case GF_FOP_SEEK:
- return "LOW";
+ pri = IOT_PRI_LO;
+ break;
case GF_FOP_NULL:
case GF_FOP_FORGET:
case GF_FOP_RELEASE:
case GF_FOP_RELEASEDIR:
case GF_FOP_GETSPEC:
+ case GF_FOP_COMPOUND:
case GF_FOP_MAXVALUE:
- case GF_FOP_DISCARD:
- return "LEAST";
- default:
- return "UNKNOWN";
+ //fail compilation on missing fop
+ //new fop must choose priority.
+ break;
}
+
+ return pri;
}
const char *
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c
index 6a61e641e19..82b9236e661 100644
--- a/libglusterfs/src/dict.c
+++ b/libglusterfs/src/dict.c
@@ -1258,6 +1258,38 @@ dict_foreach (dict_t *dict,
return ret;
}
+int
+dict_foreach_with_idx (dict_t *dict,
+ int (*fn)(dict_t *this,
+ char *key,
+ data_t *value,
+ void *data, uint64_t idx),
+ void *data)
+{
+ if (!dict) {
+ gf_log_callingfn ("dict", GF_LOG_WARNING,
+ "dict is NULL");
+ return -1;
+ }
+
+ uint64_t idx = 0;
+ int ret = -1;
+ data_pair_t *pairs = NULL;
+ data_pair_t *next = NULL;
+
+ pairs = dict->members_list;
+ while (pairs) {
+ next = pairs->next;
+ ret = fn (dict, pairs->key, pairs->value, data, idx);
+ if (ret < 0)
+ return ret;
+ pairs = next;
+ idx++;
+ }
+
+ return 0;
+}
+
/* return values:
-1 = failure,
0 = no matches found,
@@ -2979,6 +3011,16 @@ dict_dump_to_str (dict_t *dict, char *dump, int dumpsize, char *format)
return 0;
}
+/* This function converts a uint32 to a (string) key for use in a dictionary.
+ * Ensure that the key string buffer is at least DICT_UINT32_KEY_SIZE in
+ * length, since that's the maximum length of a uint32's string representation
+ * plus a terminating NUL character. */
+void
+dict_uint32_to_key (uint32_t num, char *key_buf)
+{
+ snprintf (key_buf, DICT_UINT32_KEY_SIZE, "%u", num);
+}
+
void
dict_dump_to_log (dict_t *dict)
{
diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h
index 5259c6befa1..da95bc86bec 100644
--- a/libglusterfs/src/dict.h
+++ b/libglusterfs/src/dict.h
@@ -22,7 +22,6 @@ typedef struct _data data_t;
typedef struct _dict dict_t;
typedef struct _data_pair data_pair_t;
-
#define GF_PROTOCOL_DICT_SERIALIZE(this,from_dict,to,len,ope,labl) do { \
int ret = 0; \
\
@@ -161,6 +160,8 @@ dict_t *get_new_dict ();
#define dict_for_each(d, c) for (c = d->members_list; c; c = c->next)
+#define dict_foreach_inline(d, c) for (c = d->members_list; c; c = c->next)
+
int dict_foreach (dict_t *this,
int (*fn)(dict_t *this,
char *key,
@@ -168,6 +169,13 @@ int dict_foreach (dict_t *this,
void *data),
void *data);
+int dict_foreach_with_idx (dict_t *this,
+ int (*fn)(dict_t *this,
+ char *key,
+ data_t *value,
+ void *data, uint64_t idx),
+ void *data);
+
int dict_foreach_fnmatch (dict_t *dict, char *pattern,
int (*fn)(dict_t *this,
char *key,
@@ -246,6 +254,12 @@ GF_MUST_CHECK int dict_get_str (dict_t *this, char *key, char **str);
GF_MUST_CHECK int dict_get_str_boolean (dict_t *this, char *key, int default_val);
GF_MUST_CHECK int dict_serialize_value_with_delim (dict_t *this, char *buf, int32_t *serz_len,
char delimiter);
+/* ceil(Log_10(2^32 - 1)) + 1 = 10 digits + 1. This is the length of the
+ * longest string representation of a 32-bit integer, plus space for '\0'. */
+#define DICT_UINT32_KEY_SIZE 11
+
+void dict_uint32_to_key (uint32_t num, char *key_buf);
+
void
dict_dump_to_statedump (dict_t *dict, char *dict_name, char *domain);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 0b033d8bfcf..f618c7aba19 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -331,7 +331,14 @@ static inline const char *fop_pri_to_string (gf_fop_pri_t pri)
return FOP_PRI_STRINGS[pri];
}
-const char *fop_enum_to_pri_string (glusterfs_fop_t fop);
+iot_pri_t iot_fop_to_pri (glusterfs_fop_t fop);
+
+static inline const char *fop_enum_to_pri_string (glusterfs_fop_t fop)
+{
+ iot_pri_t pri = iot_fop_to_pri (fop);
+ return fop_pri_to_string (pri);
+}
+
const char *fop_enum_to_string (glusterfs_fop_t fop);
#define GF_SET_IF_NOT_PRESENT 0x1 /* default behaviour */
@@ -444,6 +451,8 @@ struct _cmd_args {
#ifdef GF_LINUX_HOST_OS
char *oom_score_adj;
#endif
+
+ char *stats_instance_name;
};
typedef struct _cmd_args cmd_args_t;
diff --git a/libglusterfs/src/stack.c b/libglusterfs/src/stack.c
index 6977814ec69..adcedd4cc96 100644
--- a/libglusterfs/src/stack.c
+++ b/libglusterfs/src/stack.c
@@ -36,6 +36,7 @@ create_frame (xlator_t *xl, call_pool_t *pool)
frame->root = stack;
frame->this = xl;
+ frame->pri = GF_FOP_PRI_UNSPEC;
LOCK_INIT (&frame->lock);
INIT_LIST_HEAD (&frame->frames);
list_add (&frame->frames, &stack->myframes);
diff --git a/libglusterfs/src/stack.h b/libglusterfs/src/stack.h
index 9e5355a6044..094dc62312c 100644
--- a/libglusterfs/src/stack.h
+++ b/libglusterfs/src/stack.h
@@ -77,6 +77,8 @@ struct _call_frame_t {
const char *wind_to;
const char *unwind_from;
const char *unwind_to;
+
+ gf_fop_pri_t pri;
};
struct _ns_info {
@@ -122,6 +124,16 @@ struct _call_stack_t {
ns_info_t ns_info;
};
+#define frame_set_throttling(frm, should_throttle) \
+ do { \
+ if (frm) { \
+ if (should_throttle) { \
+ frm->pri = IOT_PRI_LEAST; \
+ } else { \
+ frm->pri = IOT_PRI_UNSPEC; \
+ } \
+ } \
+ } while (0)
#define frame_set_uid_gid(frm, u, g) \
do { \
@@ -259,6 +271,7 @@ STACK_RESET (call_stack_t *stack)
_new->wind_from = __FUNCTION__; \
_new->wind_to = #fn; \
_new->unwind_to = #rfn; \
+ _new->pri = frame->pri; \
\
LOCK_INIT (&_new->lock); \
LOCK(&frame->root->stack_lock); \
@@ -321,6 +334,8 @@ STACK_RESET (call_stack_t *stack)
_new->wind_from = __FUNCTION__; \
_new->wind_to = #fn; \
_new->unwind_to = #rfn; \
+ _new->pri = frame->pri; \
+ \
LOCK_INIT (&_new->lock); \
LOCK(&frame->root->stack_lock); \
{ \
diff --git a/tests/basic/afr/durability-off.t b/tests/basic/afr/durability-off.t
index 155ffa09ef0..0c4f470079a 100644
--- a/tests/basic/afr/durability-off.t
+++ b/tests/basic/afr/durability-off.t
@@ -5,6 +5,14 @@
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
+did_fsync () {
+ local count=$($CLI volume profile $V0 info | grep -w FSYNC | wc -l)
+ if [ "$count" != "0" ]; then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
cleanup;
TEST glusterd
@@ -24,7 +32,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
-EXPECT "^0$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l)
+EXPECT "N" did_fsync
#Test that fsyncs happen when durability is on
TEST $CLI volume set $V0 cluster.ensure-durability on
@@ -39,6 +47,6 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
-EXPECT "^2$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l)
+EXPECT "Y" did_fsync
cleanup;
diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t
index e429fd8cb07..a1b3edc3d5d 100644
--- a/tests/basic/fop-sampling.t
+++ b/tests/basic/fop-sampling.t
@@ -13,7 +13,7 @@ function check_path {
op=$1
path=$2
file=$3
- grep $op $file | awk -F, '{print $11}' | grep $path 2>&1 > /dev/null
+ grep $op $file | awk -F, '{print $12}' | grep $path 2>&1 > /dev/null
if [ $? -eq 0 ]; then
echo "Y"
else
@@ -106,6 +106,6 @@ for dir in "$N0" "$M0"; do
rm $dir/file2
done;
EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES
EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES
EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES
diff --git a/tests/basic/stats-dump.t b/tests/basic/stats-dump.t
index af1ad34702b..a188a45eea1 100644
--- a/tests/basic/stats-dump.t
+++ b/tests/basic/stats-dump.t
@@ -39,7 +39,9 @@ FUSE_RET="$?"
# Test that io-stats is getting queue sizes from io-threads
TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfs_nfsd_$V0.dump
-TEST ! grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump
+
+# We should be getting queue sizes on bricks now too.
+TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump
TEST [ 0 -ne "$BRICK_RET" ]
TEST [ 0 -ne "$NFSD_RET" ]
diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am
index c5df598549a..5d39afc2a14 100644
--- a/xlators/debug/io-stats/src/Makefile.am
+++ b/xlators/debug/io-stats/src/Makefile.am
@@ -2,17 +2,17 @@
xlator_LTLIBRARIES = io-stats.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-io_stats_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+io_stats_la_LDFLAGS = -module -avoid-version
io_stats_la_SOURCES = io-stats.c
io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = io-stats-mem-types.h
+noinst_HEADERS = io-stats-mem-types.h io-stats.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/rpc/xdr/src \
-I$(top_srcdir)/rpc/rpc-lib/src \
- -DDATADIR=\"$(localstatedir)\"
+ -DGLUSTERD_WORKDIR=$(GLUSTERD_WORKDIR)
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/debug/io-stats/src/io-stats-mem-types.h b/xlators/debug/io-stats/src/io-stats-mem-types.h
index 9dde9373264..af1e59abc2c 100644
--- a/xlators/debug/io-stats/src/io-stats-mem-types.h
+++ b/xlators/debug/io-stats/src/io-stats-mem-types.h
@@ -13,8 +13,6 @@
#include "mem-types.h"
-extern const char *__progname;
-
enum gf_io_stats_mem_types_ {
gf_io_stats_mt_ios_conf = gf_common_mt_end + 1,
gf_io_stats_mt_ios_fd,
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index e3511233203..67edd36e611 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -7,8 +7,11 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
#include "xlator.h"
-#include "syscall.h"
+#endif
/**
* xlators/debug/io_stats :
@@ -35,365 +38,54 @@
#include "logging.h"
#include "cli1-xdr.h"
#include "statedump.h"
+#include "io-stats.h"
#include "syncop.h"
+#include "hashfn.h"
#include <pwd.h>
#include <grp.h>
-#define MAX_LIST_MEMBERS 100
-#define DEFAULT_PWD_BUF_SZ 16384
-#define DEFAULT_GRP_BUF_SZ 16384
-#define IOS_MAX_ERRORS 132
-
-typedef enum {
- IOS_STATS_TYPE_NONE,
- IOS_STATS_TYPE_OPEN,
- IOS_STATS_TYPE_READ,
- IOS_STATS_TYPE_WRITE,
- IOS_STATS_TYPE_OPENDIR,
- IOS_STATS_TYPE_READDIRP,
- IOS_STATS_TYPE_READ_THROUGHPUT,
- IOS_STATS_TYPE_WRITE_THROUGHPUT,
- IOS_STATS_TYPE_MAX
-}ios_stats_type_t;
-
-typedef enum {
- IOS_STATS_THRU_READ,
- IOS_STATS_THRU_WRITE,
- IOS_STATS_THRU_MAX,
-}ios_stats_thru_t;
-
-struct ios_stat_lat {
- struct timeval time;
- double throughput;
-};
-
-struct ios_stat {
- gf_lock_t lock;
- uuid_t gfid;
- char *filename;
- uint64_t counters [IOS_STATS_TYPE_MAX];
- struct ios_stat_lat thru_counters [IOS_STATS_THRU_MAX];
- int refcnt;
-};
-
-struct ios_stat_list {
- struct list_head list;
- struct ios_stat *iosstat;
- double value;
-};
-
-struct ios_stat_head {
- gf_lock_t lock;
- double min_cnt;
- uint64_t members;
- struct ios_stat_list *iosstats;
-};
-
-typedef struct _ios_sample_t {
- uid_t uid;
- gid_t gid;
- char identifier[UNIX_PATH_MAX];
- char path[UNIX_PATH_MAX];
- glusterfs_fop_t fop_type;
- struct timeval timestamp;
- double elapsed;
- gf_boolean_t have_path;
- int32_t op_ret;
- int32_t op_errno;
-} ios_sample_t;
-
-
-typedef struct _ios_sample_buf_t {
- uint64_t pos; /* Position in write buffer */
- uint64_t size; /* Size of ring buffer */
- uint64_t collected; /* Number of samples we've collected */
- uint64_t observed; /* Number of FOPs we've observed */
- ios_sample_t *ios_samples; /* Our list of samples */
-} ios_sample_buf_t;
-
-
-struct ios_lat {
- double min;
- double max;
- double avg;
- uint64_t total;
-};
-
-struct ios_global_stats {
- uint64_t data_written;
- uint64_t data_read;
- uint64_t block_count_write[32];
- uint64_t block_count_read[32];
- uint64_t fop_hits[GF_FOP_MAXVALUE];
- struct timeval started_at;
- struct ios_lat latency[GF_FOP_MAXVALUE];
- uint64_t errno_count[IOS_MAX_ERRORS];
- uint64_t nr_opens;
- uint64_t max_nr_opens;
- struct timeval max_openfd_time;
-};
-
-/* This is a list of errors which are in some way critical.
- * It is useful to sample these errors even if other errors
- * should be ignored. */
-const int32_t ios_hard_error_list[] = {
- EIO,
- EROFS,
- ENOSPC,
- ENOTCONN,
- ESTALE,
-};
-
-#define IOS_HARD_ERROR_LIST_SIZE (sizeof(ios_hard_error_list) / sizeof(int32_t))
-
-const char *errno_to_name[IOS_MAX_ERRORS] = {
- "success", /* 0 */
- "eperm",
- "enoent",
- "esrch",
- "eintr",
- "eio",
- "enxio",
- "e2big",
- "enoexec",
- "ebadf",
- "echild",
- "eagain",
- "enomem",
- "eacces",
- "efault",
- "enotblk",
- "ebusy",
- "eexist",
- "exdev",
- "enodev",
- "enotdir",
- "eisdir",
- "einval",
- "enfile",
- "emfile",
- "enotty",
- "etxtbsy",
- "efbig",
- "enospc",
- "espipe",
- "erofs",
- "emlink",
- "epipe",
- "edom",
- "erange",
- "edeadlk",
- "enametoolong",
- "enolck",
- "enosys",
- "enotempty",
- "eloop",
- "ewouldblock",
- "enomsg",
- "eidrm",
- "echrng",
- "el2nsync",
- "el3hlt",
- "el3rst",
- "elnrng",
- "eunatch",
- "enocsi",
- "el2hlt",
- "ebade",
- "ebadr",
- "exfull",
- "enoano",
- "ebadrqc",
- "ebadslt",
- "edeadlock",
- "ebfont",
- "enostr",
- "enodata",
- "etime",
- "enosr",
- "enonet",
- "enopkg",
- "eremote",
- "enolink",
- "eadv",
- "esrmnt",
- "ecomm",
- "eproto",
- "emultihop",
- "edotdot",
- "ebadmsg",
- "eoverflow",
- "enotuniq",
- "ebadfd",
- "eremchg",
- "elibacc",
- "elibbad",
- "elibscn",
- "elibmax",
- "elibexec",
- "eilseq",
- "erestart",
- "estrpipe",
- "eusers",
- "enotsock",
- "edestaddrreq",
- "emsgsize",
- "eprototype",
- "enoprotoopt",
- "eprotonosupport",
- "esocktnosupport",
- "eopnotsupp",
- "epfnosupport",
- "eafnosupport",
- "eaddrinuse",
- "eaddrnotavail",
- "enetdown",
- "enetunreach",
- "enetreset",
- "econnaborted",
- "econnreset",
- "enobufs",
- "eisconn",
- "enotconn",
- "eshutdown",
- "etoomanyrefs",
- "etimedout",
- "econnrefused",
- "ehostdown",
- "ehostunreach",
- "ealready",
- "einprogress",
- "estale",
- "euclean",
- "enotnam",
- "enavail",
- "eisnam",
- "eremoteio",
- "edquot",
- "enomedium",
- "emediumtype",
- "ecanceled",
- "enokey",
- "ekeyexpired",
- "ekeyrevoked",
- "ekeyrejected",
- "eownerdead",
- "enotrecoverable"
-};
-
-struct ios_conf {
- gf_lock_t lock;
- struct ios_global_stats cumulative;
- uint64_t increment;
- struct ios_global_stats incremental;
- gf_boolean_t dump_fd_stats;
- gf_boolean_t count_fop_hits;
- gf_boolean_t measure_latency;
- struct ios_stat_head list[IOS_STATS_TYPE_MAX];
- struct ios_stat_head thru_list[IOS_STATS_THRU_MAX];
- int32_t ios_dump_interval;
- pthread_t dump_thread;
- gf_boolean_t dump_thread_should_die;
- gf_lock_t ios_sampling_lock;
- int32_t ios_sample_interval;
- int32_t ios_sample_buf_size;
- ios_sample_buf_t *ios_sample_buf;
- struct dnscache *dnscache;
- int32_t ios_dnscache_ttl_sec;
- gf_boolean_t iamshd;
- gf_boolean_t iamnfsd;
- gf_boolean_t iambrickd;
- gf_boolean_t iamgfproxyd;
- gf_boolean_t audit_creates_and_unlinks;
- gf_boolean_t sample_hard_errors;
- gf_boolean_t sample_all_errors;
- uint32_t outstanding_req;
- gf_boolean_t dump_percentile_latencies;
-};
-
-
-struct ios_fd {
- char *filename;
- uint64_t data_written;
- uint64_t data_read;
- uint64_t block_count_write[32];
- uint64_t block_count_read[32];
- struct timeval opened_at;
-};
+const char *get_ns_from_hash (struct ios_conf *conf, const ns_info_t *info);
+int ios_conf_load_namespace_conf (struct ios_conf *conf);
-typedef enum {
- IOS_DUMP_TYPE_NONE = 0,
- IOS_DUMP_TYPE_FILE = 1,
- IOS_DUMP_TYPE_DICT = 2,
- IOS_DUMP_TYPE_JSON_FILE = 3,
- IOS_DUMP_TYPE_SAMPLES = 4,
- IOS_DUMP_TYPE_MAX = 5
-} ios_dump_type_t;
-
-struct ios_dump_args {
- ios_dump_type_t type;
- union {
- FILE *logfp;
- dict_t *dict;
- } u;
-};
+gf_boolean_t should_throttle_fop (call_frame_t *frame, xlator_t *this);
-typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*,
- int , int , uint64_t ) ;
+int _ios_swap_sample_buf (xlator_t *this, ios_sample_buf_t **dest);
-struct ios_local {
- inode_t *inode;
- loc_t loc;
- fd_t *fd;
-};
-
-static struct ios_local *
-ios_local_new() {
- return GF_CALLOC (1, sizeof (struct ios_local),
- gf_common_mt_char);
-}
-
-static void
-ios_local_free (struct ios_local *local)
+/* A lot of io-stats logging code depends on the io-stats translator needing
+ * the top xlator's name, which is special. For example, on bricks, the name
+ * is the path to the backend storage directory.
+ *
+ * Fails gracefully if there is no xlator name, returning "Unknown". */
+const char *
+get_top_xlator_name (xlator_t *this)
{
- if (!local)
- return;
-
- inode_unref (local->inode);
+ const char *name = NULL;
- if (local->fd)
- fd_unref (local->fd);
-
- loc_wipe (&local->loc);
- memset (local, 0, sizeof (*local));
- GF_FREE (local);
-}
+ while (this->parents && this->parents->xlator) {
+ this = this->parents->xlator;
+ }
-struct volume_options options[];
+ this = FIRST_CHILD (this);
-static int
-is_fop_latency_started (call_frame_t *frame)
-{
- GF_ASSERT (frame);
- struct timeval epoch = {0,};
- return memcmp (&frame->begin, &epoch, sizeof (epoch));
+ if (this && this->name) {
+ return this->name;
+ } else {
+ gf_log (GF_IO_STATS, GF_LOG_ERROR,
+ "Cannot retrieve top-of-stack translator name.");
+ return "Unknown";
+ }
}
-static void
-ios_free_local (call_frame_t *frame)
+static void ios_free_local (call_frame_t *frame)
{
struct ios_local *local = frame->local;
-
ios_local_free (local);
-
frame->local = NULL;
}
-static void
-ios_track_loc (call_frame_t *frame, loc_t *loc)
+static void ios_track_loc (call_frame_t *frame, loc_t *loc)
{
struct ios_local *local = NULL;
-
if (loc && loc->path) {
/* Check if frame->local is already set (it should
* only be set by either ios_track_loc() or
@@ -412,11 +104,9 @@ ios_track_loc (call_frame_t *frame, loc_t *loc)
}
}
-static void
-ios_track_fd (call_frame_t *frame, fd_t *fd)
+static void ios_track_fd (call_frame_t *frame, fd_t *fd)
{
struct ios_local *local = NULL;
-
if (fd && fd->inode) {
if (frame->local) {
local = frame->local;
@@ -429,14 +119,21 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
}
}
-
-#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"
-#ifdef GF_LINUX_HOST_OS
-#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats"
+#ifdef HAVE_ATOMIC_BUILTINS
+#define INC_COUNTER(counter, inc, lock) \
+ (__sync_fetch_and_add (&(counter), (inc)))
#else
-#define _IOS_DUMP_DIR DATADIR "/db/glusterd/stats"
+#define INC_COUNTER(counter, inc, lock) \
+ do { \
+ LOCK (&(lock)); \
+ { \
+ counter += (inc); \
+ } \
+ UNLOCK (&(lock)); \
+ } while (0)
#endif
+#define FOP_THROTTLE(frame, this) frame_set_throttling (frame, should_throttle_fop (frame, this))
#define END_FOP_LATENCY(frame, op) \
do { \
@@ -455,7 +152,7 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
\
conf = this->private; \
if (conf && conf->measure_latency) { \
- STATS_INC (conf->outstanding_req); \
+ INC_COUNTER (conf->outstanding_req, 1, conf->lock);\
gettimeofday (&frame->begin, NULL); \
} else { \
memset (&frame->begin, 0, sizeof (frame->begin));\
@@ -474,24 +171,30 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
conf->incremental.fop_hits[GF_FOP_##op]++; \
} while (0)
-
-#if defined(HAVE_ATOMIC_BUILTINS)
-#define STATS_LOCK(x)
-#define STATS_UNLOCK(x)
-#define STATS_ADD(x,i) __sync_add_and_fetch (&x, i)
-#define STATS_SUB(x,i) __sync_sub_and_fetch (&x, i)
-#define STATS_INC(x) STATS_ADD(x, 1)
-#define STATS_DEC(x) STATS_SUB(x, 1)
+#ifdef HAVE_ATOMIC_BUILTINS
+#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ \
+ if (!is_fop_latency_started (frame)) \
+ break; \
+ conf = this->private; \
+ if (conf && conf->measure_latency && \
+ conf->count_fop_hits) { \
+ BUMP_FOP (op); \
+ __sync_fetch_and_sub (&conf->outstanding_req, 1); \
+ if (op_ret != 0 && op_errno > 0 \
+ && op_errno < IOS_MAX_ERRORS) { \
+ __sync_fetch_and_add ( \
+ &conf->cumulative.errno_count[op_errno], 1); \
+ __sync_fetch_and_add ( \
+ &conf->incremental.errno_count[op_errno], 1); \
+ } \
+ gettimeofday (&frame->end, NULL); \
+ update_ios_latency (conf, frame, GF_FOP_##op, op_ret, op_errno); \
+ } \
+ } while (0)
#else
-#define STATS_LOCK(x) LOCK (x)
-#define STATS_UNLOCK(x) UNLOCK (x)
-#define STATS_ADD(x,i) (x) += (i)
-#define STATS_SUB(x,i) (x) -= (i)
-#define STATS_INC(x) (x) += 1
-#define STATS_DEC(x) (x) -= 1
-#endif
-
-
#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \
do { \
struct ios_conf *conf = NULL; \
@@ -499,94 +202,200 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
if (!is_fop_latency_started (frame)) \
break; \
conf = this->private; \
- STATS_LOCK (&conf->lock); \
+ LOCK (&conf->lock); \
{ \
if (conf && conf->measure_latency && \
conf->count_fop_hits) { \
BUMP_FOP(op); \
- STATS_DEC (conf->outstanding_req); \
+ conf->outstanding_req -= 1; \
if (op_ret != 0 && op_errno > 0 \
&& op_errno < IOS_MAX_ERRORS) { \
- STATS_INC(conf->cumulative.errno_count[op_errno]); \
- STATS_INC(conf->incremental.errno_count[op_errno]); \
+ conf->cumulative.errno_count[op_errno]++; \
+ conf->incremental.errno_count[op_errno]++; \
} \
gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op, \
- op_ret, op_errno); \
+ update_ios_latency (conf, frame, GF_FOP_##op, op_ret, op_errno);\
} \
} \
- STATS_UNLOCK (&conf->lock); \
+ UNLOCK (&conf->lock); \
} while (0)
+#endif
-#define BUMP_READ(fd, len) \
- do { \
- struct ios_conf *conf = NULL; \
- struct ios_fd *iosfd = NULL; \
- int lb2 = 0; \
- \
- conf = this->private; \
- lb2 = log_base2 (len); \
- ios_fd_ctx_get (fd, this, &iosfd); \
- if (!conf) \
- break; \
- \
- STATS_LOCK (&conf->lock); \
- { \
- STATS_ADD (conf->cumulative.data_read, len); \
- STATS_ADD (conf->incremental.data_read, len); \
- STATS_ADD (conf->cumulative.block_count_read[lb2], 1); \
- STATS_ADD (conf->incremental.block_count_read[lb2], 1);\
- \
- if (iosfd) { \
- STATS_ADD (iosfd->data_read, len); \
- STATS_ADD (iosfd->block_count_read[lb2], 1); \
- } \
- } \
- STATS_UNLOCK (&conf->lock); \
+#ifdef HAVE_ATOMIC_BUILTINS
+#define BUMP_READ(fd, len) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ struct ios_fd *iosfd = NULL; \
+ int lb2 = 0; \
+ \
+ conf = this->private; \
+ lb2 = log_base2 (len); \
+ ios_fd_ctx_get (fd, this, &iosfd); \
+ if (!conf) \
+ break; \
+ \
+ __sync_fetch_and_add (&conf->cumulative.data_read, \
+ len); \
+ __sync_fetch_and_add (&conf->incremental.data_read, \
+ len); \
+ __sync_fetch_and_add ( \
+ &conf->cumulative.block_count_read[lb2], 1); \
+ __sync_fetch_and_add ( \
+ &conf->incremental.block_count_read[lb2], 1); \
+ if (iosfd) { \
+ __sync_fetch_and_add (&iosfd->data_read, len); \
+ __sync_fetch_and_add ( \
+ &iosfd->block_count_read[lb2], 1); \
+ } \
} while (0)
-
-#define BUMP_WRITE(fd, len) \
- do { \
- struct ios_conf *conf = NULL; \
- struct ios_fd *iosfd = NULL; \
- int lb2 = 0; \
- \
- conf = this->private; \
- lb2 = log_base2 (len); \
- ios_fd_ctx_get (fd, this, &iosfd); \
- if (!conf) \
- break; \
- STATS_LOCK (&conf->lock); \
- { \
- STATS_ADD (conf->cumulative.data_written, len); \
- STATS_ADD (conf->incremental.data_written, len); \
- STATS_ADD (conf->cumulative.block_count_write[lb2], 1);\
- STATS_ADD (conf->incremental.block_count_write[lb2], 1);\
- \
- if (iosfd) { \
- STATS_ADD (iosfd->data_written, len); \
- STATS_ADD (iosfd->block_count_write[lb2], 1); \
- } \
- } \
- STATS_UNLOCK (&conf->lock); \
+#else
+#define BUMP_READ(fd, len) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ struct ios_fd *iosfd = NULL; \
+ int lb2 = 0; \
+ \
+ conf = this->private; \
+ lb2 = log_base2 (len); \
+ ios_fd_ctx_get (fd, this, &iosfd); \
+ if (!conf) \
+ break; \
+ \
+ LOCK (&conf->lock); \
+ { \
+ conf->cumulative.data_read += len; \
+ conf->incremental.data_read += len; \
+ conf->cumulative.block_count_read[lb2]++; \
+ conf->incremental.block_count_read[lb2]++; \
+ \
+ if (iosfd) { \
+ iosfd->data_read += len; \
+ iosfd->block_count_read[lb2]++; \
+ } \
+ } \
+ UNLOCK (&conf->lock); \
} while (0)
+#endif
-#define BUMP_STATS(iosstat, type) \
+#ifdef HAVE_ATOMIC_BUILTINS
+#define BUMP_WRITE(fd, len) \
do { \
- struct ios_conf *conf = NULL; \
- uint64_t value = 0; \
+ struct ios_conf *conf = NULL; \
+ struct ios_fd *iosfd = NULL; \
+ int lb2 = 0; \
\
conf = this->private; \
+ lb2 = log_base2 (len); \
+ ios_fd_ctx_get (fd, this, &iosfd); \
+ if (!conf) \
+ break; \
+ __sync_fetch_and_add (&conf->cumulative.data_written, \
+ len); \
+ __sync_fetch_and_add (&conf->incremental.data_written, \
+ len); \
+ __sync_fetch_and_add ( \
+ &conf->cumulative.block_count_write[lb2], 1); \
+ __sync_fetch_and_add ( \
+ &conf->incremental.block_count_write[lb2], 1); \
+ \
+ if (iosfd) { \
+ __sync_fetch_and_add (&iosfd->data_written, \
+ len); \
+ __sync_fetch_and_add ( \
+ &iosfd->block_count_write[lb2], 1); \
+ } \
+ } while (0)
+#else
+#define BUMP_WRITE(fd, len) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ struct ios_fd *iosfd = NULL; \
+ int lb2 = 0; \
\
- LOCK(&iosstat->lock); \
+ conf = this->private; \
+ lb2 = log_base2 (len); \
+ ios_fd_ctx_get (fd, this, &iosfd); \
+ if (!conf) \
+ break; \
+ LOCK (&conf->lock); \
{ \
- value = STATS_ADD (iosstat->counters[type], 1); \
+ conf->cumulative.data_written += len; \
+ conf->incremental.data_written += len; \
+ conf->cumulative.block_count_write[lb2]++; \
+ conf->incremental.block_count_write[lb2]++; \
+ \
+ if (iosfd) { \
+ iosfd->data_written += len; \
+ iosfd->block_count_write[lb2]++; \
+ } \
} \
- UNLOCK (&iosstat->lock); \
- ios_stat_add_to_list (&conf->list[type], \
- value, iosstat); \
+ UNLOCK (&conf->lock); \
} while (0)
+#endif
+#ifdef HAVE_ATOMIC_BUILTINS
+#define BUMP_STATS(iosstat, type) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ uint64_t value = 0; \
+ \
+ conf = this->private; \
+ value = __sync_add_and_fetch (&iosstat->counters[type], 1); \
+ ios_stat_add_to_list (&conf->list[type], \
+ value, iosstat); \
+ \
+ } while (0)
+#else
+#define BUMP_STATS(iosstat, type) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ uint64_t value = 0; \
+ \
+ conf = this->private; \
+ \
+ LOCK(&iosstat->lock); \
+ { \
+ iosstat->counters[type]++; \
+ value = iosstat->counters[type]; \
+ } \
+ UNLOCK (&iosstat->lock); \
+ ios_stat_add_to_list (&conf->list[type], \
+ value, iosstat); \
+ \
+ } while (0)
+#endif
+
+#ifdef HAVE_ATOMIC_BUILTINS
+#define BUMP_THROUGHPUT(iosstat, type) \
+ do { \
+ struct ios_conf *conf = NULL; \
+ double elapsed; \
+ struct timeval *begin, *end; \
+ double throughput; \
+ int flag = 0; \
+ \
+ begin = &frame->begin; \
+ end = &frame->end; \
+ \
+ elapsed = (end->tv_sec - begin->tv_sec) * 1e6 \
+ + (end->tv_usec - begin->tv_usec); \
+ throughput = op_ret / elapsed; \
+ \
+ conf = this->private; \
+ \
+ if (iosstat->thru_counters[type].throughput \
+ <= throughput) { \
+ iosstat->thru_counters[type].throughput = \
+ throughput; \
+ gettimeofday (&iosstat-> \
+ thru_counters[type].time, NULL); \
+ flag = 1; \
+ } \
+ if (flag) \
+ ios_stat_add_to_list (&conf->thru_list[type], \
+ throughput, iosstat); \
+ } while (0)
+#else
#define BUMP_THROUGHPUT(iosstat, type) \
do { \
struct ios_conf *conf = NULL; \
@@ -603,7 +412,7 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
throughput = op_ret / elapsed; \
\
conf = this->private; \
- STATS_LOCK (&iosstat->lock); \
+ LOCK (&iosstat->lock); \
{ \
if (iosstat->thru_counters[type].throughput \
<= throughput) { \
@@ -614,11 +423,79 @@ ios_track_fd (call_frame_t *frame, fd_t *fd)
flag = 1; \
} \
} \
- STATS_UNLOCK (&iosstat->lock); \
- if (flag) \
+ UNLOCK (&iosstat->lock); \
+ if (flag) \
ios_stat_add_to_list (&conf->thru_list[type], \
throughput, iosstat); \
} while (0)
+#endif
+
+
+/*
+ * So why go to all this trouble? Why not just queue up some samples in
+ * a big list and malloc away? Well, malloc is expensive relative
+ * to what we are measuring, so we cannot have any mallocs (or worse,
+ * callocs) in our measurement code paths. Instead, we are going to
+ * pre-allocate a circular buffer and collect a maximum number of samples.
+ * Prior to dumping them all we'll create a new buffer and swap the
+ * old buffer with the new, and then proceed to dump the statistics
+ * in our dump thread.
+ *
+ */
+/*
+ * Allocate a sample buffer together with its backing array of
+ * `buf_size' samples.  On any allocation failure everything already
+ * allocated is released and NULL is returned; on success the buffer
+ * starts out empty (pos/observed/collected all zero).
+ */
+ios_sample_buf_t *
+ios_create_sample_buf (size_t buf_size)
+{
+        ios_sample_buf_t *buf = NULL;
+
+        buf = GF_CALLOC (1, sizeof (*buf),
+                         gf_io_stats_mt_ios_sample_buf);
+        if (!buf)
+                return NULL;
+
+        buf->ios_samples = GF_CALLOC (buf_size,
+                                      sizeof (*buf->ios_samples),
+                                      gf_io_stats_mt_ios_sample);
+        if (!buf->ios_samples) {
+                GF_FREE (buf);
+                return NULL;
+        }
+
+        buf->size      = buf_size;
+        buf->pos       = 0;
+        buf->observed  = 0;
+        buf->collected = 0;
+
+        return buf;
+}
+
+/*
+ * Release a sample buffer and its backing sample array.
+ *
+ * Fix: tolerate a NULL buffer -- the original unconditionally
+ * dereferenced ios_sample_buf->ios_samples, crashing if the buffer was
+ * never created (e.g. ios_create_sample_buf failed).  GF_FREE on the
+ * members themselves is already NULL-safe.
+ */
+void
+ios_destroy_sample_buf (ios_sample_buf_t *ios_sample_buf)
+{
+        if (!ios_sample_buf)
+                return;
+
+        GF_FREE (ios_sample_buf->ios_samples);
+        GF_FREE (ios_sample_buf);
+}
+
+/*
+ * Create the configuration's sample buffer (sized from
+ * conf->ios_sample_buf_size) under conf->lock.
+ * Returns 0 on success, -1 if allocation failed.
+ */
+static int
+ios_init_sample_buf (struct ios_conf *conf)
+{
+        int32_t ret = -1;
+
+        GF_ASSERT (conf);
+
+        LOCK (&conf->lock);
+        {
+                conf->ios_sample_buf =
+                        ios_create_sample_buf (conf->ios_sample_buf_size);
+                if (conf->ios_sample_buf)
+                        ret = 0;
+        }
+        UNLOCK (&conf->lock);
+
+        return ret;
+}
int
ios_fd_ctx_get (fd_t *fd, xlator_t *this, struct ios_fd **iosfd)
@@ -715,72 +592,6 @@ ios_inode_ctx_get (inode_t *inode, xlator_t *this, struct ios_stat **iosstat)
}
-/*
- * So why goto all this trouble? Why not just queue up some samples in
- * a big list and malloc away? Well malloc is expensive relative
- * to what we are measuring, so cannot have any malloc's (or worse
- * callocs) in our measurement code paths. Instead, we are going to
- * pre-allocate a circular buffer and collect a maximum number of samples.
- * Prior to dumping them all we'll create a new buffer and swap the
- * old buffer with the new, and then proceed to dump the statistics
- * in our dump thread.
- *
- */
-ios_sample_buf_t *
-ios_create_sample_buf (size_t buf_size)
-{
- ios_sample_buf_t *ios_sample_buf = NULL;
- ios_sample_t *ios_samples = NULL;
-
- ios_sample_buf = GF_CALLOC (1,
- sizeof (*ios_sample_buf),
- gf_io_stats_mt_ios_sample_buf);
- if (!ios_sample_buf)
- goto err;
-
- ios_samples = GF_CALLOC (buf_size,
- sizeof (*ios_samples),
- gf_io_stats_mt_ios_sample);
-
- if (!ios_samples)
- goto err;
-
- ios_sample_buf->ios_samples = ios_samples;
- ios_sample_buf->size = buf_size;
- ios_sample_buf->pos = 0;
- ios_sample_buf->observed = 0;
- ios_sample_buf->collected = 0;
-
- return ios_sample_buf;
-err:
- GF_FREE (ios_sample_buf);
- return NULL;
-}
-
-void
-ios_destroy_sample_buf (ios_sample_buf_t *ios_sample_buf)
-{
- GF_FREE (ios_sample_buf->ios_samples);
- GF_FREE (ios_sample_buf);
-}
-
-static int
-ios_init_sample_buf (struct ios_conf *conf)
-{
- int32_t ret = -1;
-
- GF_ASSERT (conf);
- LOCK (&conf->lock);
- conf->ios_sample_buf = ios_create_sample_buf (
- conf->ios_sample_buf_size);
- if (!conf->ios_sample_buf)
- goto out;
- ret = 0;
-out:
- UNLOCK (&conf->lock);
- return ret;
-}
-
int
ios_stat_add_to_list (struct ios_stat_head *list_head, uint64_t value,
struct ios_stat *iosstat)
@@ -843,13 +654,11 @@ ios_stat_add_to_list (struct ios_stat_head *list_head, uint64_t value,
new->value = value;
ios_stat_ref (iosstat);
list_add_tail (&new->list, &tmp->list);
- if (last) {
- stat = last->iosstat;
- last->iosstat = NULL;
- ios_stat_unref (stat);
- list_del (&last->list);
- GF_FREE (last);
- }
+ stat = last->iosstat;
+ last->iosstat = NULL;
+ ios_stat_unref (stat);
+ list_del (&last->list);
+ GF_FREE (last);
if (reposition == MAX_LIST_MEMBERS)
list_head->min_cnt = value;
else if (min_count) {
@@ -876,7 +685,7 @@ out:
return 0;
}
-static int
+static inline int
ios_stats_cleanup (xlator_t *this, inode_t *inode)
{
@@ -918,27 +727,24 @@ io_stats_log_px_stat (xlator_t *this, ios_sample_buf_t *sample_buf,
int i = 0;
int px = 0;
struct ios_conf *conf = NULL;
-
+
collected = sample_buf->collected;
-
+
px = (int)(pval * 100);
pn_idx = (int)(pval * collected);
pn_val = global_ios_latency[pn_idx];
- ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc,
- "global", px, pn_val);
+ ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, "global", px, pn_val);
for (i = 0; i < GF_FOP_MAXVALUE; i++) {
qsort (ios_latencies[i], num_fop[i], sizeof (double),
gf_compare_double);
pn_idx = (int)(pval*num_fop[i]);
pn_val = ios_latencies[i][pn_idx];
- ios_log (this, logfp,
- "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, gf_fop_list[i], px, pn_val);
+ ios_log (this, logfp, "\"%s.%s.p%d_latency_usec\":\"%0.4lf\",", desc, gf_fop_list[i], px, pn_val);
}
}
-
double
io_stats_prepare_latency_samples (ios_sample_buf_t *sample_buf, int *num_fop,
double **ios_latencies,
@@ -970,7 +776,8 @@ io_stats_prepare_latency_samples (ios_sample_buf_t *sample_buf, int *num_fop,
double
io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf,
- FILE* logfp, char * key_prefix, char * str_prefix)
+ FILE* logfp, char * key_prefix,
+ char * str_prefix)
{
double *ios_latencies[GF_FOP_MAXVALUE] = {NULL, };
double *global_ios_latency = NULL;
@@ -991,8 +798,6 @@ io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf,
}
for (i = 0; i < GF_FOP_MAXVALUE; i++) {
- qsort (ios_latencies[i], num_fop[i], sizeof (double),
- gf_compare_double);
ios_latencies[i] = GF_CALLOC (collected,
sizeof (double),
0);
@@ -1028,7 +833,9 @@ io_stats_dump_pn_latencies (xlator_t *this, ios_sample_buf_t *sample_buf,
num_fop, ios_latencies, global_ios_latency);
out:
+
GF_FREE (prefix);
+
GF_FREE (global_ios_latency);
for (j = 0; j < i; j++) {
GF_FREE (ios_latencies[j]);
@@ -1038,8 +845,7 @@ out:
}
int
-ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this,
- FILE *logfp)
+ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this, FILE* logfp)
{
struct ios_stat_list *entry = NULL;
@@ -1056,7 +862,7 @@ ios_dump_file_stats (struct ios_stat_head *list_head, xlator_t *this,
int
ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this,
- FILE *logfp, ios_stats_thru_t type)
+ FILE* logfp, ios_stats_type_t type)
{
struct ios_stat_list *entry = NULL;
struct timeval time = {0, };
@@ -1081,6 +887,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this,
int
_io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
+ struct ios_conf *conf = NULL;
char *key_root = "storage.gluster";
char *xlator_name = NULL;
char *instance_name = NULL;
@@ -1088,9 +895,10 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
int bytes_written = 0;
int i = 0;
int ret = 0;
- struct ios_conf *conf = this->private;
- xlator_name = strdupa (this->name);
+ conf = this->private;
+ xlator_name = strdupa (get_top_xlator_name (this));
+
for (i = 0; i < strlen (xlator_name); i++) {
if (xlator_name[i] == '/')
xlator_name[i] = '_';
@@ -1100,15 +908,14 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
if (conf->iamshd) {
xlator_name = "shd";
} else if (conf->iamnfsd) {
+ instance_name = xlator_name;
xlator_name = "nfsd";
- instance_name = strdupa (this->name);
} else if (conf->iamgfproxyd) {
+ instance_name = xlator_name;
xlator_name = "gfproxyd";
- instance_name = strdupa (this->name);
- }
-
- if (strcmp (__progname, "glusterfsd") == 0)
+ } else if (conf->iambrickd) {
key_root = "storage.gluster.brick";
+ }
if (instance_name) {
/* +3 for 2 x "." + NULL */
@@ -1221,8 +1028,9 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
ios_log (this, logfp, "\"%s.%s.inodelk_count\": \"%u\",",
key_prefix, str_prefix, inodelk_count);
- ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc, &xattr,
- GLUSTERFS_ENTRYLK_COUNT, NULL, NULL);
+ ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc,
+ &xattr, GLUSTERFS_ENTRYLK_COUNT,
+ NULL, NULL);
if (ret == 0 && xattr) {
if (dict_get_int32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
&entrylk_count) != 0) {
@@ -1319,12 +1127,11 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
fop_lat_max = stats->latency[i].max;
}
}
- if (interval == -1) {
- ios_log (this, logfp,
- "\"%s.%s.fop.%s.count\": \"%"PRId64"\",",
- key_prefix, str_prefix, lc_fop_name,
- fop_hits);
- } else {
+ ios_log (this, logfp,
+ "\"%s.%s.fop.%s.count\": \"%"PRId64"\",",
+ key_prefix, str_prefix, lc_fop_name,
+ fop_hits);
+ if (interval != -1) {
ios_log (this, logfp,
"\"%s.%s.fop.%s.per_sec\": \"%0.2lf\",",
key_prefix, str_prefix, lc_fop_name,
@@ -1355,6 +1162,7 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
"\"%s.%s.fop.weighted_latency_ave_usec_nozerofill\": \"%0.4lf\",",
key_prefix, str_prefix, weighted_fop_ave_usec);
}
+
ios_log (this, logfp,
"\"%s.%s.fop.weighted_latency_ave_usec\": \"%0.4lf\",",
key_prefix, str_prefix, weighted_fop_ave_usec);
@@ -1371,8 +1179,8 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
"\"%s.%s.fop.GOT1\":\"%0.4lf\",",
key_prefix, str_prefix, fop_ave_usec);
if (conf->dump_percentile_latencies) {
- io_stats_dump_pn_latencies (this, sample_buf, logfp,
- key_prefix, str_prefix);
+ io_stats_dump_pn_latencies (this, sample_buf, logfp, key_prefix,
+ str_prefix);
ios_log (this, logfp,
"\"%s.%s.fop.GOT2\":\"%0.4lf\",",
key_prefix, str_prefix, fop_ave_usec);
@@ -1381,29 +1189,32 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
"\"%s.%s.fop.GOT3\":\"%0.4lf\",",
key_prefix, str_prefix, fop_ave_usec);
- if (conf->iamnfsd) {
+ if (conf->iamnfsd || conf->iambrickd) {
dict_t *xattr = NULL;
- ret = syncop_getxattr (this, &unused_loc, &xattr,
+ ret = syncop_getxattr (FIRST_CHILD (this), &unused_loc, &xattr,
IO_THREADS_QUEUE_SIZE_KEY, NULL, NULL);
if (xattr) {
- // Iterate over the dictionary returned to us by io-threads and
- // dump the results to the stats file.
+ /*
+ * Iterate over the dictionary returned to us by
+ * io-threads and dump the results to the stats file.
+ */
data_pair_t *curr = NULL;
- dict_for_each (xattr, curr) {
+ dict_foreach_inline (xattr, curr) {
ios_log (this, logfp,
- "\"%s.%s.%s.queue_size\": \"%d\",",
- key_prefix, str_prefix, curr->key,
- data_to_int32 (curr->value));
+ "\"%s.%s.%s.queue_size\": \"%d\",",
+ key_prefix, str_prefix, curr->key,
+ data_to_int32 (curr->value));
}
- // Free the dictionary
+ /* Free the dictionary! */
dict_unref (xattr);
} else {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to get queue size counts from "
- "the io-threads translator!");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to get queue size counts "
+ "from the io-threads translator!");
}
}
+
if (interval == -1) {
ios_log (this, logfp, "\"%s.%s.uptime\": \"%"PRId64"\",",
key_prefix, str_prefix,
@@ -1427,7 +1238,19 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
(double)(stats->data_written / interval_sec));
}
+#if OFFSET_WRITE_DETECTOR
+ if (conf->iamnfsd) {
+ ios_log (this, logfp,
+ ",\"%s.%s.offset_writes\": \"%"PRId64"\",",
+ key_prefix, str_prefix, stats->offset_write_ops);
+ ios_log (this, logfp,
+ "\"%s.%s.total_writes\": \"%"PRId64"\"",
+ key_prefix, str_prefix, stats->total_write_ops);
+ }
+#endif
+
ios_log (this, logfp, "}");
+
ret = 0;
out:
GF_FREE (key_prefix);
@@ -1443,14 +1266,14 @@ _resolve_username (xlator_t *this, uid_t uid)
char *pwd_buf = NULL;
char *ret = NULL;
- /* Prepare our buffer for the uid->username translation */
+ // Prepare our buffer for the uid->username translation
#ifdef _SC_GETGR_R_SIZE_MAX
pwd_buf_len = sysconf (_SC_GETGR_R_SIZE_MAX);
#else
pwd_buf_len = -1;
#endif
if (pwd_buf_len == -1) {
- pwd_buf_len = DEFAULT_PWD_BUF_SZ; /* per the man page */
+ pwd_buf_len = DEFAULT_PWD_BUF_SZ; // per the man page
}
pwd_buf = alloca (pwd_buf_len);
@@ -1482,14 +1305,14 @@ _resolve_group_name (xlator_t *this, gid_t gid)
char *grp_buf = NULL;
char *ret = NULL;
- /* Prepare our buffer for the gid->group name translation */
+ // Prepare our buffer for the gid->group name translation
#ifdef _SC_GETGR_R_SIZE_MAX
grp_buf_len = sysconf (_SC_GETGR_R_SIZE_MAX);
#else
grp_buf_len = -1;
#endif
if (grp_buf_len == -1) {
- grp_buf_len = DEFAULT_GRP_BUF_SZ; /* per the man page */
+ grp_buf_len = DEFAULT_GRP_BUF_SZ; // per the man page
}
grp_buf = alloca (grp_buf_len);
@@ -1520,7 +1343,7 @@ err:
*/
void
_io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
- FILE *logfp)
+ FILE* logfp)
{
double epoch_time = 0.00;
char *xlator_name = NULL;
@@ -1531,8 +1354,12 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
char *port_pos = NULL;
char *group_name = NULL;
char *username = NULL;
- char *path = NULL;
struct ios_conf *conf = NULL;
+ const char *pri_str = NULL;
+ const char *ns_name = NULL;
+ const char *path = NULL;
+ const char *name = NULL;
+ loc_t *loc = NULL;
const char *error_string = NULL;
int32_t op_errno = 0;
@@ -1541,7 +1368,7 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
epoch_time = (sample->timestamp).tv_sec +
((sample->timestamp).tv_usec / 1000000.0);
- if (strlen (sample->identifier) == 0) {
+ if (!sample->identifier || (strlen (sample->identifier) == 0)) {
hostname = "Unknown";
port = "Unknown";
} else {
@@ -1553,16 +1380,12 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
if (!port)
goto err;
*port_pos = '\0';
- hostname = gf_rev_dns_lookup_cached (identifier,
- conf->dnscache);
+ hostname = gf_rev_dns_lookup (identifier);
if (!hostname)
- hostname = "Unknown";
+ hostname ="Unknown";
}
- xlator_name = this->name;
- if (!xlator_name || strlen (xlator_name) == 0)
- xlator_name = "Unknown";
-
+ xlator_name = strdupa (get_top_xlator_name (this));
instance_name = this->instance_name;
if (!instance_name || strlen (instance_name) == 0)
instance_name = "N/A";
@@ -1581,6 +1404,24 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
sprintf (group_name, "%d", (int32_t)sample->gid);
}
+ /*
+ * If the priority was unspecified, print out actual priority of the FOP,
+ * otherwise print the priority given on the frame.
+ */
+ if (sample->fop_pri != GF_FOP_PRI_UNSPEC) {
+ pri_str = fop_pri_to_string (sample->fop_pri);
+ } else {
+ pri_str = fop_enum_to_pri_string (sample->fop_type);
+ }
+
+ ns_name = get_ns_from_hash (conf, &sample->ns_info);
+ if (!ns_name) {
+ ns_name = "Unknown";
+ } else if (strcmp (ns_name, "/") == 0) {
+ /* Give the root a readable name in the samples. */
+ ns_name = "Root";
+ }
+
path = "Unknown";
if (sample->have_path)
path = sample->path;
@@ -1592,11 +1433,11 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
}
ios_log (this, logfp,
- "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s",
- epoch_time, fop_enum_to_pri_string (sample->fop_type),
+ "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%s,%d,%s",
+ epoch_time, pri_str,
fop_enum_to_string (sample->fop_type),
sample->elapsed, xlator_name, instance_name, username,
- group_name, hostname, port, path, op_errno, error_string);
+ group_name, hostname, port, ns_name, path, op_errno, error_string);
goto out;
err:
gf_log (this->name, GF_LOG_ERROR,
@@ -1606,6 +1447,426 @@ out:
GF_FREE (username);
}
+/**
+ * get_ns_from_hash
+ *
+ * Map a namespace hash (produced by the namespace hashing xlator) back
+ * to the namespace name stored in conf->hash_to_ns.  That name is used
+ * as the key into the other throttling dictionaries.
+ *
+ * @conf: Translator configuration struct.
+ * @info: Namespace hash information.
+ *
+ * Returns the name on success, NULL when no hash was recorded or the
+ * hash is unknown.
+ */
+const char *
+get_ns_from_hash (struct ios_conf *conf, const ns_info_t *info)
+{
+        char  ns_key[DICT_UINT32_KEY_SIZE];
+        char *ns_name = NULL;
+
+        /* Handle the special case hashes. */
+        if (!info->found) {
+                gf_log (GF_IO_STATS, GF_LOG_DEBUG, "NS > H: NO HASH.");
+                return NULL;
+        }
+
+        dict_uint32_to_key (info->hash, ns_key);
+
+        if (dict_get_str (conf->hash_to_ns, ns_key, &ns_name) != 0) {
+                gf_log (GF_IO_STATS, GF_LOG_DEBUG,
+                        "NS > H: HASH %u NOT FOUND.",
+                        info->hash);
+                return NULL;
+        }
+
+        gf_log (GF_IO_STATS, GF_LOG_DEBUG,
+                "NS > H: %u -> %s.",
+                info->hash, ns_name);
+        return ns_name;
+}
+
+/**
+ * get_fop_hits_for_ns
+ *
+ * Look up the ios_fop_hits_t structure that tracks fop averages for the
+ * given namespace in conf->fop_hits_dict.
+ *
+ * @this:    Pointer to the io-stats xlator.
+ * @ns_name: Namespace whose fop_hits structure is wanted.
+ *
+ * Returns the structure, or NULL when no entry (or an empty entry)
+ * exists for the namespace.
+ */
+ios_fop_hits_t *
+get_fop_hits_for_ns (const xlator_t *this, const char *ns_name)
+{
+        struct ios_conf *conf  = this->private;
+        data_t          *entry = NULL;
+
+        entry = dict_get (conf->fop_hits_dict, (char *)ns_name);
+        if (!entry)
+                return NULL;
+
+        return (ios_fop_hits_t *)entry->data;
+}
+
+/**
+ * get_fop_rate_for_ns
+ *
+ * Report the most recently computed moving-average fop rate for the
+ * given namespace.
+ *
+ * @this:    Pointer to the io-stats xlator.
+ * @ns_name: Namespace whose fop rate is wanted.
+ *
+ * Returns the rate, or -1 when no fop_hits entry exists.
+ */
+double
+get_fop_rate_for_ns (const xlator_t *this, const char *ns_name)
+{
+        ios_fop_hits_t *hits = NULL;
+
+        hits = get_fop_hits_for_ns (this, ns_name);
+        if (!hits)
+                return -1;
+
+        return hits->avg_fops;
+}
+
+/*
+ * Fetch the configured fop-rate limit for a namespace from
+ * conf->throttling_rates.  A missing entry yields -1, which callers
+ * treat as "no limit configured".
+ */
+int64_t
+get_allowed_fop_rate_for_ns (struct ios_conf *conf, const char *ns_name)
+{
+        int64_t rate = -1;
+
+        if (dict_get_int64 (conf->throttling_rates,
+                            (char *)ns_name, &rate) != 0)
+                return -1;
+
+        return rate;
+}
+
+/**
+ * should_throttle_fop
+ *
+ * Decide whether the fop carried by @frame should be throttled, by
+ * comparing the measured fop rate of its namespace against the
+ * configured limit.  Any missing piece of information (throttling
+ * disabled, no rate table, unknown namespace, no measured rate, limit
+ * unset / -1) means "do not throttle".
+ *
+ * Fixes: removed the unused local `ret'; the debug log used %ld for an
+ * int64_t argument, which is undefined on ILP32 platforms -- cast to
+ * long long and use %lld instead.
+ *
+ * @frame: Call frame carrying the namespace hash info.
+ * @this:  Pointer to the io-stats xlator.
+ */
+gf_boolean_t
+should_throttle_fop (call_frame_t *frame, xlator_t *this)
+{
+        struct ios_conf *conf = this->private;
+        const char *ns_name = NULL;
+        double fop_rate = 0;
+        int64_t allowed_fop_rate = 0;
+
+        /* If throttling is not enabled, of course, do not throttle. */
+        if (!conf->throttling_enabled) {
+                goto no_throttle;
+        }
+
+        /* If no throttling rates are defined, default to no throttling. */
+        if (!conf->throttling_rates) {
+                goto no_throttle;
+        }
+
+        /*
+         * Attempt to find the namespace directory name from the id, and if
+         * we cannot find it, default to no throttling.
+         */
+        ns_name = get_ns_from_hash (conf, &frame->root->ns_info);
+        if (!ns_name) {
+                goto no_throttle;
+        }
+
+        /*
+         * Attempt to find the current fop rate for the directory.
+         * If for some reason we cannot find it, default to no throttling.
+         */
+        fop_rate = get_fop_rate_for_ns (this, ns_name);
+        if (fop_rate < 0) {
+                goto no_throttle;
+        }
+
+        /*
+         * If we can't find an allowed fop rate for the namespace, or if it
+         * is set to (-1), then default to no throttling.
+         */
+        allowed_fop_rate = get_allowed_fop_rate_for_ns (conf, ns_name);
+        if (allowed_fop_rate < 0) {
+                goto no_throttle;
+        }
+
+        /*
+         * Finally, if the share has exceeded the fop rate,
+         * we should throttle it.
+         */
+        if (fop_rate <= allowed_fop_rate) {
+                goto no_throttle;
+        }
+
+        /* If we are here after all of that, finally throttle the namespace. */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "%f > %lld | Throttling FOP to namespace '%s'",
+                fop_rate, (long long)allowed_fop_rate, ns_name);
+        return _gf_true;
+
+no_throttle:
+        return _gf_false;
+}
+
+/*
+ * Allocate a zeroed per-namespace fop-hit tracker; returns NULL on OOM.
+ *
+ * Fix: declare the parameter list as (void) -- an empty () is an
+ * obsolescent K&R-style declaration in C, not a prototype.
+ *
+ * NOTE(review): this reuses gf_io_stats_mt_ios_sample_buf as the memory
+ * accounting type for a different struct -- confirm whether a dedicated
+ * mt type should be added.
+ */
+static inline ios_fop_hits_t *
+ios_fop_hits_create (void)
+{
+        return GF_CALLOC (1, sizeof (ios_fop_hits_t),
+                          gf_io_stats_mt_ios_sample_buf);
+}
+
+/**
+ * io_stats_process_ios_sample
+ *
+ * Fold one latency sample into the per-namespace fop-hit counters used
+ * for moving-average rate computation.  Samples are silently skipped
+ * when namespace rate measurement is disabled or not yet initialized.
+ *
+ * Fix: the original never checked the result of ios_fop_hits_create()
+ * or bin_to_data(); an OOM there led straight to a NULL dereference.
+ *
+ * @this:   Pointer to the io-stats xlator.
+ * @sample: Collected latency sample to account.
+ *
+ * Returns 0 on success (or skip), -1 on failure.
+ */
+int
+io_stats_process_ios_sample (const xlator_t *this, const ios_sample_t *sample)
+{
+        struct ios_conf *conf = this->private;
+        ios_fop_hits_t *fop_hits = NULL;
+        const char *ns_name = NULL;
+        int ret = 0;
+        data_t *fop_hits_dict_entry_data = NULL;
+
+        if (!conf) {
+                ret = -1;
+                goto out;
+        }
+
+        /* Silently skip if ns_info isn't initialized yet. Sometimes samples
+         * get dumped before the dict can be enabled, and while it's not bad,
+         * it's quite noisy on the brick logs. */
+        if (!conf->measure_namespace_rates || !conf->fop_hits_dict) {
+                return 0;
+        }
+
+        ns_name = get_ns_from_hash (conf, &sample->ns_info);
+        if (!ns_name) {
+                /* It is still useful to collect statistics on "unknown" samples
+                 * so we know how many FOPs are untagged by the namespace xlator. */
+                ns_name = "unknown";
+        }
+
+        fop_hits_dict_entry_data = dict_get (conf->fop_hits_dict,
+                                             (char *)ns_name);
+
+        if (!fop_hits_dict_entry_data) {
+                /*
+                 * If no FOP hits are detected for a namespace, lets create
+                 * one and place it in the dictionary.
+                 */
+                fop_hits = ios_fop_hits_create ();
+                if (!fop_hits) {
+                        ret = -1;
+                        goto out;
+                }
+                fop_hits_dict_entry_data = bin_to_data (fop_hits,
+                                                        sizeof (*fop_hits));
+                if (!fop_hits_dict_entry_data) {
+                        /* Don't leak the tracker if wrapping it failed. */
+                        GF_FREE (fop_hits);
+                        ret = -1;
+                        goto out;
+                }
+                ret = dict_set (conf->fop_hits_dict, (char *)ns_name,
+                                fop_hits_dict_entry_data);
+                if (ret != 0) {
+                        goto out;
+                }
+        }
+
+        /*
+         * Let's now take the fop_hits and increment the appropriate counter.
+         */
+        fop_hits = (ios_fop_hits_t *)fop_hits_dict_entry_data->data;
+        fop_hits->hits[fop_hits->idx]++;
+        fop_hits->elapsed[fop_hits->idx] += sample->elapsed;
+
+out:
+        return ret;
+}
+
+/**
+ * io_stats_compute_sma_for_ns
+ *
+ * Compute a simple moving average of fop rate and fop latency for one
+ * namespace over the last conf->ns_rate_window circular-buffer slots,
+ * log both values into the JSON stats stream, and cache them on the
+ * fop_hits structure (read later by get_fop_rate_for_ns for throttling).
+ *
+ * @args:     Dump arguments (xlator pointer and output FILE).
+ * @ns_name:  Namespace being averaged ("/" is printed as "root").
+ * @fop_hits: Per-namespace circular hit/latency counters.
+ * @dict:     The fop_hits dict being iterated (only dict->count is read,
+ *            to decide whether a trailing ',' is emitted).
+ * @idx:      Position of this entry within the dict iteration.
+ *
+ * Returns 0 on success, -1 on validation/key-prefix failure.
+ */
+int
+io_stats_compute_sma_for_ns (struct ios_dump_args *args, const char *ns_name,
+                ios_fop_hits_t *fop_hits, dict_t *dict, uint64_t idx)
+{
+        xlator_t *this = args->this;
+        struct ios_conf *conf = this->private;
+        int ret = -1;
+        int hit_idx, i;
+        int num_samples = 0;
+        double avg_fops = 0;
+        double elapsed = 0;
+        double avg_latency = 0;
+        char *key_prefix = NULL;
+
+        GF_VALIDATE_OR_GOTO (this->name, this, out);
+        GF_VALIDATE_OR_GOTO (this->name, ns_name, out);
+        GF_VALIDATE_OR_GOTO (this->name, fop_hits, out);
+
+        ret = _io_stats_get_key_prefix (this, &key_prefix);
+        if (ret) {
+                goto out;
+        }
+
+        /* For each i'th window before our current sliding index (e.g. the
+         * 0th window, 1st window, 2nd window preceding the current index),
+         * we want to try to sum the counted hits */
+        for (i = 0; i < conf->ns_rate_window; i++) {
+                /* This gets the window at `idx - i` and wraps correctly around
+                 * the FOP_HITS_SIZE-sized array. */
+                hit_idx = (fop_hits->idx - i + FOP_HITS_SIZE) % FOP_HITS_SIZE;
+                gf_log (this->name, GF_LOG_DEBUG, "%s.hits[%d] = %lu * %d",
+                        ns_name, hit_idx, fop_hits->hits[hit_idx],
+                        conf->ios_sample_interval);
+                gf_log (this->name, GF_LOG_DEBUG, "%s.latency[%d] = %lf",
+                        ns_name, hit_idx, fop_hits->elapsed[hit_idx]);
+                /* Accumulate the number of samples that we've seen. */
+                num_samples += fop_hits->hits[hit_idx];
+                /* We add the total elapsed time for all samples in our window,
+                 * which we will divide later by the total number of processed
+                 * samples. */
+                elapsed += fop_hits->elapsed[hit_idx];
+        }
+
+        /* The number of fops is given by the samples multiplied by the sample
+         * interval, divided by the total length in time of the window. */
+        avg_fops = (double)num_samples * conf->ios_sample_interval /
+                   ((double)conf->ns_rate_window * conf->ios_dump_interval);
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Average fop rate for %s = %lf / sec", ns_name, avg_fops);
+
+        /* The average latency is given by the total elapsed time for all of
+         * the samples divided by the total number of samples. If we have 0
+         * samples, just return 0. */
+        avg_latency = (num_samples > 0) ? (elapsed / (double)num_samples) : 0;
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Average fop latency for %s = %lf usec / fop", ns_name, avg_latency);
+
+        if (strcmp ("/", ns_name) == 0) {
+                /* Give the root namespace a readable name. */
+                ns_name = "root";
+        }
+
+        /* Include a ',' unless we're at the end of the dict. */
+        ios_log (this, args->u.logfp,
+                 "\"%s.namespaces.%s.fop_rate\": \"%lf\",",
+                 key_prefix, ns_name, avg_fops);
+        ios_log (this, args->u.logfp,
+                 "\"%s.namespaces.%s.avg_fop_latency\": \"%lf\"%s",
+                 key_prefix, ns_name, avg_latency, (idx == dict->count - 1) ? "" : ",");
+
+        fop_hits->avg_fops = avg_fops;
+        fop_hits->avg_latency = avg_latency;
+        ret = 0;
+out:
+        if (key_prefix) {
+                GF_FREE (key_prefix);
+        }
+
+        return ret;
+}
+
+/*
+ * dict_foreach_with_idx callback: compute the moving average for one
+ * namespace entry of the fop_hits dict.  Entries with a missing key or
+ * value are skipped.  Always returns 0 so iteration continues.
+ */
+int
+__io_stats_compute_moving_average_foreach (dict_t *this, char *key,
+                data_t *value, void *data, uint64_t idx)
+{
+        struct ios_dump_args *args   = data;
+        xlator_t             *x_this = args->this;
+        ios_fop_hits_t       *hits   = NULL;
+
+        if (!key || !value)
+                return 0;
+
+        gf_log (x_this->name, GF_LOG_TRACE, "Will compute averages for %s",
+                key);
+        hits = (ios_fop_hits_t *)value->data;
+        io_stats_compute_sma_for_ns (args, key, hits, this, idx);
+
+        return 0;
+}
+
+/**
+ * ios_compute_moving_average
+ *
+ * Snapshot the per-namespace moving-average fop rates into the JSON
+ * stats stream (a "{ ... }" object), one entry per namespace in
+ * conf->fop_hits_dict.  No-op when rate measurement is disabled or the
+ * dict is absent/empty.
+ *
+ * Fix: gf_log was called with __func__ as the format string; a
+ * non-literal format string is a -Wformat-security hazard -- pass it
+ * through "%s" instead.
+ *
+ * @args: Dump arguments (xlator pointer and output FILE).
+ */
+int
+ios_compute_moving_average (struct ios_dump_args *args)
+{
+        xlator_t *this = args->this;
+        struct ios_conf *conf = this->private;
+
+        if (!conf->measure_namespace_rates) {
+                return 0;
+        }
+
+        if (!conf->fop_hits_dict) {
+                return 0;
+        }
+
+        if (conf->fop_hits_dict->count > 0) {
+                ios_log (args->this, args->u.logfp, "{");
+                gf_log (this->name, GF_LOG_TRACE, "%s", __func__);
+                dict_foreach_with_idx (conf->fop_hits_dict,
+                                       __io_stats_compute_moving_average_foreach,
+                                       args);
+                ios_log (args->this, args->u.logfp, "}");
+        }
+
+        return 0;
+}
+
+
+/*
+ * Placeholder for an exponential-moving-average variant of the
+ * per-namespace rate computation.  Currently unimplemented: always
+ * returns 0 and ignores all arguments (only the SMA path above is
+ * active).
+ */
+int
+io_stats_compute_ema_for_dir (xlator_t *this, const char *dirname,
+                              ios_fop_hits_t *fop_hits)
+{
+        return 0;
+}
+
+/*
+ * dict_foreach callback: advance one namespace's circular fop-hit
+ * window by a single slot and zero the slot just entered.  Always
+ * returns 0 so iteration continues.
+ */
+int
+__io_stats_bump_ctr_foreach (dict_t *this, char *key, data_t *value, void *data)
+{
+        xlator_t       *xl   = (xlator_t *)data;
+        ios_fop_hits_t *hits = (ios_fop_hits_t *)value->data;
+
+        /* Increment counter (and wrap), then reset hits for that index. */
+        hits->idx = (hits->idx + 1) % FOP_HITS_SIZE;
+        hits->hits[hits->idx] = 0;
+        hits->elapsed[hits->idx] = 0;
+
+        gf_log (xl->name, GF_LOG_TRACE, "Moved counters for %s to %ld",
+                key, hits->idx);
+
+        return 0;
+}
+
+/**
+ * io_stats_bump_ctrs
+ *
+ * Rotate every namespace's circular fop-hit window by one slot; called
+ * once per dump interval after the moving averages were snapshotted.
+ *
+ * Fixes: gf_log was called with __func__ as the format string
+ * (-Wformat-security hazard); and, unlike the sibling
+ * ios_compute_moving_average, a NULL fop_hits_dict was not guarded
+ * before dict_foreach.
+ *
+ * @this: Pointer to the io-stats xlator.
+ */
+void
+io_stats_bump_ctrs (xlator_t *this)
+{
+        struct ios_conf *conf = this->private;
+
+        if (!conf->measure_namespace_rates) {
+                return;
+        }
+
+        if (!conf->fop_hits_dict) {
+                return;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "%s", __func__);
+        dict_foreach (conf->fop_hits_dict, __io_stats_bump_ctr_foreach,
+                      this);
+}
+
+/*
+ * dict_foreach callback: free the ios_fop_hits_t payload of one dict
+ * entry and clear the dangling pointer.  Entries with a missing key or
+ * value are skipped.  Always returns 0 so iteration continues.
+ */
+int
+__io_stats_destroy_fop_hits_dict_foreach (dict_t *this, char *key,
+                                          data_t *value, void *data)
+{
+        if (!key || !value)
+                return 0;
+
+        gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Will destroy values for %s", key);
+        GF_FREE (value->data);
+        value->data = NULL;
+
+        return 0;
+}
+
+/**
+ * io_stats_destroy_fop_hits_dict
+ *
+ * Free every ios_fop_hits_t stored in the dict, destroy the dict, and
+ * NULL out the caller's pointer.
+ *
+ * Fix: the original guarded only the pointer-to-pointer; a NULL
+ * *fop_hits_dict (never created, or already destroyed) was still
+ * handed to dict_foreach/dict_destroy.
+ */
+static inline void
+io_stats_destroy_fop_hits_dict (dict_t **fop_hits_dict)
+{
+        if (!fop_hits_dict || !*fop_hits_dict)
+                return;
+
+        /*
+         * Loop through each of the entries in the dict and destroy
+         * allocated values.
+         */
+        dict_foreach (*fop_hits_dict,
+                      __io_stats_destroy_fop_hits_dict_foreach, NULL);
+        dict_destroy (*fop_hits_dict);
+        *fop_hits_dict = NULL;
+}
+
+/*
+ * Dump-interval hook for namespace fop-rate accounting: first snapshot
+ * the moving averages for this interval, then rotate the per-namespace
+ * hit counters so the next interval starts in a fresh slot.
+ */
+void
+io_stats_dump_foprate_stats (struct ios_dump_args *args)
+{
+        ios_compute_moving_average (args);
+
+        io_stats_bump_ctrs (args->this);
+}
+
/*
* Takes in our sample buffer and then dumps out the contents
*/
@@ -1618,36 +1879,39 @@ io_stats_dump_latency_samples_logfp (xlator_t *this, FILE* logfp,
/* Empty case, nothing to do, exit. */
if (sample_buf == NULL || sample_buf->collected == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_TRACE,
"No samples, dump not required.");
- ret = 0;
+ ret =0;
goto out;
}
- /* Wrap-around case, dump from pos to sample_buf->size -1
- * and then from 0 to sample_buf->pos (covered off by
- * "simple case")
- */
- if (sample_buf->collected > sample_buf->pos + 1) {
+ /* If we've collected more samples than the buffer can hold, then we
+ * should dump the whole buffer, starting from the oldest entries which
+ * begin at sample_buf->pos. */
+ if (sample_buf->collected > sample_buf->size) {
for (i = sample_buf->pos; i < sample_buf->size; i++) {
- _io_stats_write_latency_sample (this,
- &(sample_buf->ios_samples[i]), logfp);
+ ios_sample_t *sample = &(sample_buf->ios_samples[i]);
+ io_stats_process_ios_sample (this, sample);
+ _io_stats_write_latency_sample (this, sample, logfp);
}
}
- /* Simple case: Dump from 0 to sample_buf->pos */
+ /* Simple case: Dump from 0 to sample_buf->pos. These are the newest
+ * entries if we have wrapped around our buffer, as well. */
for (i = 0; i < sample_buf->pos; i++) {
- _io_stats_write_latency_sample (this,
- &(sample_buf->ios_samples[i]), logfp);
+ ios_sample_t *sample = &(sample_buf->ios_samples[i]);
+ io_stats_process_ios_sample (this, sample);
+ _io_stats_write_latency_sample (this, sample, logfp);
}
out:
return ret;
}
+
int
io_stats_dump_global_to_logfp (xlator_t *this, struct ios_global_stats *stats,
- struct timeval *now, int interval, FILE *logfp)
+ struct timeval *now, int interval, FILE* logfp)
{
int i = 0;
int per_line = 0;
@@ -1789,15 +2053,13 @@ io_stats_dump_global_to_logfp (xlator_t *this, struct ios_global_stats *stats,
ios_log (this, logfp, "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
"\tFILE NAME");
list_head = &conf->thru_list[IOS_STATS_THRU_READ];
- ios_dump_throughput_stats(list_head, this, logfp,
- IOS_STATS_THRU_READ);
+ ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_TYPE_READ);
ios_log (this, logfp, "\n======Write Throughput File Stats======");
ios_log (this, logfp, "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
"\tFILE NAME");
list_head = &conf->thru_list[IOS_STATS_THRU_WRITE];
- ios_dump_throughput_stats (list_head, this, logfp,
- IOS_STATS_THRU_WRITE);
+ ios_dump_throughput_stats (list_head, this, logfp, IOS_STATS_TYPE_WRITE);
}
return 0;
}
@@ -2001,58 +2263,6 @@ ios_global_stats_clear (struct ios_global_stats *stats, struct timeval *now)
stats->started_at = *now;
}
-static int io_stats_dump_quorum (xlator_t *this, struct ios_dump_args *args) {
- FILE *logf = args->u.logfp;
- loc_t root_loc = {0};
- dict_t *dict = NULL;
- xlator_list_t *child = NULL;
- const char *leading_comma = "";
-
- if (args->type != IOS_DUMP_TYPE_JSON_FILE) {
- return -EINVAL;
- }
-
- if (!this->itable->root) {
- return -ENOENT;
- }
-
- // If we don't build a valid 'loc', dht_getxattr swallows our request
- // instead of passing it down to AFR.
- root_loc.path = "/";
- root_loc.name = "";
- root_loc.inode = inode_ref (this->itable->root);
- gf_uuid_copy (root_loc.gfid, root_loc.inode->gfid);
-
- ios_log (this, logf, "{");
-
- for (child = this->children; child; child = child->next) {
- dict = NULL;
-
- syncop_getxattr (child->xlator, &root_loc, &dict,
- GF_AFR_QUORUM_CHECK, NULL, NULL);
-
- if (dict) {
- const data_pair_t *e;
-
- dict_for_each (dict, e) {
- ios_log (this, logf,
- "%s\"storage.gluster.nfsd.%s\": \"%d\"",
- leading_comma,
- e->key, data_to_int32 (e->value));
- leading_comma = ",";
- }
-
- dict_unref (dict);
- }
- }
-
- ios_log (this, logf, "}");
-
- inode_unref (root_loc.inode);
-
- return 0;
-}
-
int
io_stats_dump (xlator_t *this, ios_sample_buf_t *sample_buf,
struct ios_dump_args *args, gf1_cli_info_op op,
@@ -2093,19 +2303,15 @@ io_stats_dump (xlator_t *this, ios_sample_buf_t *sample_buf,
}
UNLOCK (&conf->lock);
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_CUMULATIVE) {
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_CUMULATIVE)
io_stats_dump_global (this, &cumulative, sample_buf, &now,
-1, args);
- }
- if (op == GF_CLI_INFO_ALL || op == GF_CLI_INFO_INCREMENTAL) {
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_INCREMENTAL)
io_stats_dump_global (this, &incremental, sample_buf, &now,
increment, args);
- }
-
- if (conf->iamnfsd) {
- io_stats_dump_quorum (this, args);
- }
return 0;
}
@@ -2176,6 +2382,50 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd)
return 0;
}
+gf_boolean_t _should_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, ios_sample_buf_t* ios_sample_buf,
+ int32_t op_ret, int32_t op_errno)
+{
+ int i;
+
+ /* If sampling is disabled, return false */
+ if (conf->ios_sample_interval == 0)
+ return _gf_false;
+
+ /* Sometimes it's useful to sample errors. If `fop-sample-all-errors`
+ * is active, then we should sample ALL errors. */
+ if (op_ret < 0 && op_errno != 0 && conf->sample_all_errors) {
+ return _gf_true;
+ }
+
+ /* If `fop-sample-hard-errors` is active, we only look through a small
+ * subset of errno values to sample, those which are critical to Gluster
+ * functioning. */
+ if (op_ret < 0 && op_errno != 0 && conf->sample_hard_errors) {
+ for (i = 0; i < IOS_HARD_ERROR_LIST_SIZE; i++) {
+ if (abs (op_errno) == ios_hard_error_list[i]) {
+ return _gf_true;
+ }
+ }
+ }
+
+ /* If auditing is on, sample TRUNCATE, CREATE, UNLINK, RMDIR, MKDIR 1:1 */
+ if (conf->audit_creates_and_unlinks) {
+ switch (fop_type) {
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_CREATE:
+ case GF_FOP_UNLINK:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RMDIR:
+ return _gf_true;
+ default:
+ break;
+ }
+ }
+
+ /* Sample only 1 out of ios_sample_interval number of fops. */
+ return (ios_sample_buf->observed % conf->ios_sample_interval == 0);
+}
+
void ios_local_get_inode (struct ios_local *local, inode_t **inode)
{
if (!local)
@@ -2215,8 +2465,7 @@ void ios_local_get_path (call_frame_t *frame, const char **path)
*/
ios_inode_ctx_get (inode, frame->this, &iosstat);
if (iosstat) {
- gf_log ("io-stats", GF_LOG_DEBUG,
- "[%s] Getting path from iostat struct",
+ gf_log ("io-stats", GF_LOG_DEBUG, "[%s] Getting path from iostat struct",
fop_enum_to_string (frame->op));
*path = iosstat->filename;
goto out;
@@ -2228,8 +2477,7 @@ void ios_local_get_path (call_frame_t *frame, const char **path)
* inside the local.
*/
if (local->loc.path) {
- gf_log ("io-stats", GF_LOG_DEBUG,
- "[%s] Getting path from loc_t",
+ gf_log ("io-stats", GF_LOG_DEBUG, "[%s] Getting path from loc_t",
fop_enum_to_string (frame->op));
*path = local->loc.path;
goto out;
@@ -2239,60 +2487,13 @@ out:
/* If the inode and the loc don't have the path, we're out of luck.
*/
if (!*path) {
- gf_log ("io-stats", GF_LOG_DEBUG,
- "Unable to get path for fop: %s",
+ gf_log ("io-stats", GF_LOG_DEBUG, "Unable to get path for fop: %s",
fop_enum_to_string (frame->op));
}
return;
}
-gf_boolean_t
-_should_sample (struct ios_conf *conf, glusterfs_fop_t fop_type,
- ios_sample_buf_t* ios_sample_buf, int32_t op_ret,
- int32_t op_errno)
-{
- int i;
-
- /* If sampling is disabled, return false */
- if (conf->ios_sample_interval == 0)
- return _gf_false;
-
- /* Sometimes it's useful to sample errors. If `fop-sample-all-errors`
- * is active, then we should sample ALL errors. */
- if (op_ret < 0 && op_errno != 0 && conf->sample_all_errors) {
- return _gf_true;
- }
-
- /* If `fop-sample-hard-errors` is active, we only look through a small
- * subset of errno values to sample, those which are critical to Gluster
- * functioning. */
- if (op_ret < 0 && op_errno != 0 && conf->sample_hard_errors) {
- for (i = 0; i < IOS_HARD_ERROR_LIST_SIZE; i++) {
- if (abs (op_errno) == ios_hard_error_list[i]) {
- return _gf_true;
- }
- }
- }
-
- /* If auditing is on, sample TRUNCATE, CREATE, UNLINK, RMDIR, MKDIR 1:1 */
- if (conf->audit_creates_and_unlinks) {
- switch (fop_type) {
- case GF_FOP_TRUNCATE:
- case GF_FOP_CREATE:
- case GF_FOP_UNLINK:
- case GF_FOP_MKDIR:
- case GF_FOP_RMDIR:
- return _gf_true;
- default:
- break;
- }
- }
-
- /* Sample only 1 out of ios_sample_interval number of fops. */
- return (ios_sample_buf->observed % conf->ios_sample_interval == 0);
-}
-
void collect_ios_latency_sample (struct ios_conf *conf,
glusterfs_fop_t fop_type, double elapsed,
call_frame_t *frame, int32_t op_ret, int32_t op_errno)
@@ -2304,12 +2505,11 @@ void collect_ios_latency_sample (struct ios_conf *conf,
call_stack_t *root = NULL;
const char *path = NULL;
-
ios_sample_buf = conf->ios_sample_buf;
LOCK (&conf->ios_sampling_lock);
- if (!_should_sample (conf, fop_type, ios_sample_buf, op_ret, op_errno)) {
+
+ if (!_should_sample (conf, fop_type, ios_sample_buf, op_ret, op_errno))
goto out;
- }
timestamp = &frame->begin;
root = frame->root;
@@ -2317,54 +2517,44 @@ void collect_ios_latency_sample (struct ios_conf *conf,
ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);
ios_sample->elapsed = elapsed;
ios_sample->fop_type = fop_type;
- ios_sample->op_ret = op_ret;
- ios_sample->op_errno = op_errno;
- ios_sample->uid = root->uid;
- ios_sample->gid = root->gid;
- (ios_sample->timestamp).tv_sec = timestamp->tv_sec;
- (ios_sample->timestamp).tv_usec = timestamp->tv_usec;
- memcpy (&ios_sample->identifier, &root->identifier,
- sizeof (root->identifier));
- /* Eventually every FOP will be supported
- * (i.e., the frame->local will be
+ /* Eventually every FOP will be supported (i.e., the frame->local will be
* of type struct ios_local), but for now, this is a safety.
*/
switch (ios_sample->fop_type) {
-
- case GF_FOP_CREATE:
- case GF_FOP_OPEN:
- case GF_FOP_STAT:
- case GF_FOP_FSTAT:
- case GF_FOP_READ:
- case GF_FOP_WRITE:
- case GF_FOP_OPENDIR:
- case GF_FOP_READDIRP:
- case GF_FOP_READDIR:
- case GF_FOP_FLUSH:
- case GF_FOP_ACCESS:
- case GF_FOP_UNLINK:
- case GF_FOP_TRUNCATE:
- case GF_FOP_MKDIR:
- case GF_FOP_RMDIR:
- case GF_FOP_SETATTR:
- case GF_FOP_LOOKUP:
- case GF_FOP_INODELK:
- case GF_FOP_FINODELK:
- case GF_FOP_ENTRYLK:
- case GF_FOP_FXATTROP:
- case GF_FOP_XATTROP:
- case GF_FOP_GETXATTR:
- case GF_FOP_FGETXATTR:
- case GF_FOP_SETXATTR:
- case GF_FOP_FSETXATTR:
- case GF_FOP_STATFS:
- case GF_FOP_FSYNC:
- ios_local_get_path (frame, &path);
- break;
- default:
- path = NULL;
- break;
+ case GF_FOP_CREATE:
+ case GF_FOP_OPEN:
+ case GF_FOP_STAT:
+ case GF_FOP_FSTAT:
+ case GF_FOP_READ:
+ case GF_FOP_WRITE:
+ case GF_FOP_OPENDIR:
+ case GF_FOP_READDIRP:
+ case GF_FOP_READDIR:
+ case GF_FOP_FLUSH:
+ case GF_FOP_ACCESS:
+ case GF_FOP_UNLINK:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RMDIR:
+ case GF_FOP_SETATTR:
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FXATTROP:
+ case GF_FOP_XATTROP:
+ case GF_FOP_GETXATTR:
+ case GF_FOP_FGETXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_STATFS:
+ case GF_FOP_FSYNC:
+ ios_local_get_path (frame, &path);
+ break;
+ default:
+ path = NULL;
+ break;
}
if (path) {
@@ -2372,12 +2562,19 @@ void collect_ios_latency_sample (struct ios_conf *conf,
ios_sample->have_path = _gf_true;
}
- /* We've reached the end of the circular buffer, start from the
- * beginning. */
- if (ios_sample_buf->pos == (ios_sample_buf->size - 1))
- ios_sample_buf->pos = 0;
- else
- ios_sample_buf->pos++;
+ ios_sample->op_ret = op_ret;
+ ios_sample->op_errno = op_errno;
+ ios_sample->fop_pri = frame->pri;
+ ios_sample->uid = root->uid;
+ ios_sample->gid = root->gid;
+ ios_sample->ns_info = frame->root->ns_info;
+ (ios_sample->timestamp).tv_sec = timestamp->tv_sec;
+ (ios_sample->timestamp).tv_usec = timestamp->tv_usec;
+ memcpy (&ios_sample->identifier, &root->identifier, sizeof (root->identifier));
+
+        /* Advance the write position, wrapping back to the start of the
+         * circular buffer once the end is reached (modulo arithmetic). */
+ ios_sample_buf->pos = (ios_sample_buf->pos + 1) % ios_sample_buf->size;
ios_sample_buf->collected++;
out:
ios_sample_buf->observed++;
@@ -2547,8 +2744,7 @@ unlock_list_head:
}
static int
-attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path,
- const uuid_t gfid) {
+attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, const uuid_t gfid) {
struct ios_stat *iosstat = NULL;
if (!inode) {
@@ -2557,21 +2753,22 @@ attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path,
ios_inode_ctx_get (inode, this, &iosstat);
if (!iosstat) {
- iosstat = GF_CALLOC (1, sizeof (*iosstat),
- gf_io_stats_mt_ios_stat);
+ iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat);
if (!iosstat) {
return -ENOMEM;
}
iosstat->filename = gf_strdup (path);
gf_uuid_copy (iosstat->gfid, gfid);
LOCK_INIT (&iosstat->lock);
+#if OFFSET_WRITE_DETECTOR
+ iosstat->expected_write_offset = UNKNOWN_WRITE_OFFSET;
+#endif
ios_inode_ctx_set (inode, this, iosstat);
}
return 0;
}
-
int
ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd)
{
@@ -2613,7 +2810,6 @@ free_and_out:
return ret;
}
-
int
io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd,
@@ -2650,12 +2846,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&conf->lock);
- attach_iosstat_to_inode (this, local->loc.inode, local->loc.path,
- buf->ia_gfid);
-
+ attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, buf->ia_gfid);
unwind:
UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno);
+
ios_free_local (frame);
+
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2681,6 +2877,7 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
conf = this->private;
+
ios_build_fd (this, local->loc.path, fd, &iosfd);
if (!iosfd) {
goto unwind;
@@ -2701,16 +2898,15 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN);
}
- attach_iosstat_to_inode (this, local->loc.inode,
- local->loc.path,
- local->loc.inode->gfid);
-
+ attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, local->loc.inode->gfid);
unwind:
UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno);
+
ios_free_local (frame);
+
STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
- return 0;
+ return 0;
}
@@ -2719,7 +2915,9 @@ io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno);
+
ios_free_local (frame);
+
STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2750,7 +2948,6 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (iosstat) {
BUMP_STATS (iosstat, IOS_STATS_TYPE_READ);
BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ);
-
}
unwind:
@@ -2778,39 +2975,34 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno);
ios_inode_ctx_get (local->inode, this, &iosstat);
-
if (iosstat) {
BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE);
BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE);
}
+
unwind:
ios_free_local (frame);
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
return 0;
-
}
-
-
-
int
io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)
{
+ struct ios_local *local = NULL;
struct ios_stat *iosstat = NULL;
- inode_t *inode = frame->local;
- frame->local = NULL;
+ local = frame->local;
UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno);
- ios_inode_ctx_get (inode, this, &iosstat);
-
+ ios_inode_ctx_get (local->inode, this, &iosstat);
if (iosstat) {
- BUMP_STATS (iosstat, IOS_STATS_TYPE_READDIRP);
- iosstat = NULL;
+ BUMP_STATS (iosstat, IOS_STATS_TYPE_READDIRP);
}
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2829,7 +3021,6 @@ io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
ios_free_local (frame);
- UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno);
STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2842,8 +3033,7 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
UPDATE_PROFILE_STATS (frame, FSYNC, op_ret, op_errno);
ios_free_local (frame);
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
return 0;
}
@@ -2894,7 +3084,6 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *sbuf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
return 0;
}
@@ -2907,11 +3096,10 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata, struct iatt *postparent)
{
struct ios_local *local = frame->local;
-
if (local && local->loc.path && inode && op_ret >= 0) {
- attach_iosstat_to_inode (this, inode, local->loc.path,
- inode->gfid);
+ attach_iosstat_to_inode (this, inode, local->loc.path, inode->gfid);
}
+
UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno);
ios_free_local (frame);
STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,
@@ -2956,9 +3144,11 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct ios_local *local = frame->local;
if (local && local->loc.path) {
- local->inode = inode_ref (inode);
- attach_iosstat_to_inode (this, inode, local->loc.path,
- buf->ia_gfid);
+ local->inode = inode_ref (inode); /* Just for consistency's
+ * sake.
+ */
+
+ attach_iosstat_to_inode (this, inode, local->loc.path, buf->ia_gfid);
}
UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno);
@@ -3008,14 +3198,12 @@ io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret < 0)
goto unwind;
- attach_iosstat_to_inode (this, local->inode, local->loc.path,
- local->inode->gfid);
+ attach_iosstat_to_inode (this, local->inode, local->loc.path, local->inode->gfid);
ios_fd_ctx_set (local->fd, this, 0);
ios_inode_ctx_get (local->fd->inode, this, &iosstat);
if (iosstat)
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR);
-
unwind:
UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno);
ios_free_local (frame);
@@ -3029,7 +3217,6 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
-
UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno);
ios_free_local (frame);
STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
@@ -3121,7 +3308,6 @@ io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -3132,7 +3318,6 @@ io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -3148,15 +3333,14 @@ io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* in NFS. We need to make sure that we are attaching the
* data correctly to this inode.
*/
- if (local->loc.inode && local->loc.path) {
- attach_iosstat_to_inode (this, local->loc.inode,
- local->loc.path,
- local->loc.inode->gfid);
+ if (local && local->loc.inode && local->loc.path) {
+ attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, local->loc.inode->gfid);
}
UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno);
ios_free_local (frame);
STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
+
return 0;
}
@@ -3167,7 +3351,6 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
prebuf, postbuf, xdata);
return 0;
@@ -3191,8 +3374,7 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno);
- ios_free_local (frame);
- STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -3204,8 +3386,7 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, DISCARD, op_ret, op_errno);
- ios_free_local (frame);
- STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -3216,7 +3397,6 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
@@ -3227,7 +3407,6 @@ io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno);
- ios_free_local (frame);
STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);
return 0;
}
@@ -3285,6 +3464,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this,
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_entrylk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->entrylk,
@@ -3301,6 +3482,8 @@ io_stats_inodelk (call_frame_t *frame, xlator_t *this,
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_inodelk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->inodelk,
@@ -3325,8 +3508,11 @@ io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_finodelk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->finodelk,
@@ -3340,8 +3526,11 @@ io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_xattrop_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->xattrop,
@@ -3355,8 +3544,11 @@ io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_fxattrop_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fxattrop,
@@ -3370,8 +3562,11 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_lookup_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->lookup,
@@ -3384,8 +3579,11 @@ int
io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_stat_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->stat,
@@ -3398,7 +3596,6 @@ int
io_stats_readlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, size_t size, dict_t *xdata)
{
- ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_readlink_cbk,
@@ -3413,9 +3610,10 @@ int
io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
- ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_mknod_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->mknod,
@@ -3429,8 +3627,11 @@ io_stats_mkdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_mkdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->mkdir,
@@ -3444,9 +3645,12 @@ io_stats_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
- STACK_WIND (frame, io_stats_unlink_cbk,
+ FOP_THROTTLE (frame, this);
+
+ STACK_WIND_COOKIE (frame, io_stats_unlink_cbk, loc,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->unlink,
loc, xflag, xdata);
@@ -3459,9 +3663,12 @@ io_stats_rmdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, int flags, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
- STACK_WIND (frame, io_stats_rmdir_cbk,
+ FOP_THROTTLE (frame, this);
+
+ STACK_WIND_COOKIE (frame, io_stats_rmdir_cbk, loc,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->rmdir,
loc, flags, xdata);
@@ -3475,6 +3682,8 @@ io_stats_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
{
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_symlink_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->symlink,
@@ -3489,6 +3698,8 @@ io_stats_rename (call_frame_t *frame, xlator_t *this,
{
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_rename_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->rename,
@@ -3503,6 +3714,8 @@ io_stats_link (call_frame_t *frame, xlator_t *this,
{
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_link_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->link,
@@ -3516,8 +3729,11 @@ io_stats_setattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_setattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->setattr,
@@ -3531,8 +3747,11 @@ io_stats_truncate (call_frame_t *frame, xlator_t *this,
loc_t *loc, off_t offset, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_truncate_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate,
@@ -3550,6 +3769,8 @@ io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_open_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->open,
@@ -3562,13 +3783,14 @@ int
io_stats_create (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, mode_t mode,
mode_t umask, fd_t *fd, dict_t *xdata)
-
{
ios_track_loc (frame, loc);
ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_create_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->create,
@@ -3582,8 +3804,11 @@ io_stats_readv (call_frame_t *frame, xlator_t *this,
fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_readv_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readv,
@@ -3607,13 +3832,36 @@ io_stats_writev (call_frame_t *frame, xlator_t *this,
len = iov_length (vector, count);
+#if OFFSET_WRITE_DETECTOR
+ if (conf->iamnfsd && fd->inode) {
+ struct ios_stat *iosstat = NULL;
+
+ ios_inode_ctx_get (fd->inode, this, &iosstat);
+ if (iosstat) {
+ int is_offset = iosstat->expected_write_offset != UNKNOWN_WRITE_OFFSET &&
+ iosstat->expected_write_offset != offset;
+
+ if (is_offset) {
+ ++conf->cumulative.offset_write_ops;
+ ++conf->incremental.offset_write_ops;
+ }
+ ++conf->cumulative.total_write_ops;
+ ++conf->incremental.total_write_ops;
+ iosstat->expected_write_offset = offset + len;
+ }
+ }
+#endif
+
BUMP_WRITE (fd, len);
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_writev_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->writev,
fd, vector, count, offset, flags, iobref, xdata);
+
return 0;
}
@@ -3624,6 +3872,7 @@ io_stats_statfs (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_statfs_cbk,
@@ -3639,8 +3888,11 @@ io_stats_flush (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_flush_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->flush,
@@ -3654,8 +3906,11 @@ io_stats_fsync (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t flags, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_fsync_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsync,
@@ -3686,37 +3941,44 @@ conditional_dump (dict_t *dict, char *key, data_t *value, void *data)
memset (filename, 0, value->len + 1);
memcpy (filename, data_to_str (value), value->len);
- pid = getpid ();
+ if (fnmatch ("*io*stat*dump", key, 0) == 0) {
+ pid = getpid ();
- if (!strncmp (filename, "", 1)) {
- gf_log (this->name, GF_LOG_ERROR, "No filename given");
- return -1;
- }
- logfp = fopen (filename, "w+");
- if (!logfp) {
- gf_log (this->name, GF_LOG_ERROR, "failed to open %s "
+
+ if (!strncmp (filename, "", 1)) {
+ gf_log (this->name, GF_LOG_ERROR, "No filename given");
+ return -1;
+ }
+ logfp = fopen (filename, "w+");
+ if (!logfp) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open %s "
"for writing", filename);
- return -1;
- }
- sprintf (dump_key, "*io*stat*%d_json_dump", pid);
- if (fnmatch (dump_key, key, 0) == 0) {
- (void) ios_dump_args_init (
- &args, IOS_DUMP_TYPE_JSON_FILE,
- logfp);
- } else {
- (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE,
- logfp);
+ return -1;
+ }
+ sprintf (dump_key, "*io*stat*%d_json_dump", pid);
+ if (fnmatch (dump_key, key, 0) == 0) {
+ (void) ios_dump_args_init (
+ &args, IOS_DUMP_TYPE_JSON_FILE,
+ logfp);
+ } else {
+ (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE,
+ logfp);
+ }
+
+ io_stats_dump (this, NULL, &args, GF_CLI_INFO_ALL,
+ _gf_false);
+
+ fclose (logfp);
}
- io_stats_dump (this, NULL, &args, GF_CLI_INFO_ALL, _gf_false);
- fclose (logfp);
+
return 0;
}
int
_ios_destroy_dump_thread (struct ios_conf *conf) {
conf->dump_thread_should_die = _gf_true;
- if (conf->ios_dump_interval > 0) {
- (void) pthread_cancel (conf->dump_thread);
+
+ if (conf->dump_thread) {
(void) pthread_join (conf->dump_thread, NULL);
}
return 0;
@@ -3768,60 +4030,55 @@ _ios_dump_thread (xlator_t *this) {
struct ios_conf *conf = NULL;
FILE *stats_logfp = NULL;
FILE *samples_logfp = NULL;
+ FILE *foprate_logfp = NULL;
struct ios_dump_args args = {0};
ios_sample_buf_t *sample_buf = NULL;
int i;
int stats_bytes_written = 0;
int samples_bytes_written = 0;
+ int foprate_bytes_written = 0;
char stats_filename[PATH_MAX];
char samples_filename[PATH_MAX];
- char *xlator_name;
- char *instance_name;
- gf_boolean_t log_stats_fopen_failure = _gf_true;
- gf_boolean_t log_samples_fopen_failure = _gf_true;
- int old_cancel_type;
- int ret;
+ char foprate_filename[PATH_MAX];
+ char *xlator_name = NULL;
+ char *instance_name = NULL;
+ time_t namespace_conf_mtime;
+ int ret = 0;
conf = this->private;
+ xlator_name = strdupa (get_top_xlator_name (this));
+
gf_log (this->name, GF_LOG_INFO, "IO stats dump thread started, "
"polling IO stats every %d seconds", conf->ios_dump_interval);
- xlator_name = strdupa (this->name);
+
for (i = 0; i < strlen (xlator_name); i++) {
if (xlator_name[i] == '/')
xlator_name[i] = '_';
}
+
instance_name = this->instance_name;
if (conf->iamshd) {
xlator_name = "shd";
} else if (conf->iamnfsd) {
+ instance_name = xlator_name;
xlator_name = "nfsd";
- instance_name = strdupa (this->name);
- } else if (conf->iamgfproxyd == _gf_true) {
+ } else if (conf->iamgfproxyd) {
+ instance_name = xlator_name;
xlator_name = "gfproxyd";
- instance_name = strdupa (this->name);
- }
- if (sys_mkdir (_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
- if (errno != EEXIST) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not create stats-dump directory %s",
- _IOS_DUMP_DIR);
- goto out;
- }
- }
- if (sys_mkdir (_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
- if (errno != EEXIST) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not create stats-sample directory %s",
- _IOS_SAMP_DIR);
- goto out;
- }
}
+
+ mkdir (_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG);
+ mkdir (_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG);
+
if (instance_name) {
stats_bytes_written = snprintf (stats_filename, PATH_MAX,
"%s/%s_%s_%s.dump", _IOS_DUMP_DIR,
__progname, xlator_name, instance_name);
- samples_bytes_written = snprintf (samples_filename, PATH_MAX,
- "%s/%s_%s_%s.samp", _IOS_SAMP_DIR,
+ samples_bytes_written = snprintf (samples_filename,
+ PATH_MAX, "%s/%s_%s_%s.samp", _IOS_SAMP_DIR,
+ __progname, xlator_name, instance_name);
+ foprate_bytes_written = snprintf (foprate_filename,
+ PATH_MAX, "%s/%s_%s_%s_namespace_stats.dump", _IOS_DUMP_DIR,
__progname, xlator_name, instance_name);
} else {
stats_bytes_written = snprintf (stats_filename, PATH_MAX,
@@ -3830,22 +4087,40 @@ _ios_dump_thread (xlator_t *this) {
samples_bytes_written = snprintf (samples_filename, PATH_MAX,
"%s/%s_%s.samp", _IOS_SAMP_DIR, __progname,
xlator_name);
+ foprate_bytes_written = snprintf (foprate_filename, PATH_MAX,
+ "%s/%s_%s_namespace_stats.dump", _IOS_DUMP_DIR, __progname,
+ xlator_name);
}
- if ((stats_bytes_written >= PATH_MAX) ||
- (samples_bytes_written >= PATH_MAX)) {
+ if (stats_bytes_written >= PATH_MAX ||
+ samples_bytes_written >= PATH_MAX) {
gf_log (this->name, GF_LOG_ERROR,
"Invalid path for stats dump (%s) and/or latency "
"samples (%s)", stats_filename, samples_filename);
goto out;
}
- while (1) {
- if (conf->dump_thread_should_die)
+
+ gf_log (this->name, GF_LOG_INFO,
+ "Writing to files: stats - %s, samples - %s, foprate - %s",
+ stats_filename, samples_filename, foprate_filename);
+
+ while (_gf_true) {
+ if (conf->dump_thread_should_die) {
break;
- (void) pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS,
- &old_cancel_type);
+ }
+
sleep (conf->ios_dump_interval);
- (void) pthread_setcanceltype (PTHREAD_CANCEL_DEFERRED,
- &old_cancel_type);
+
+ /* First try and load the throttling conf file */
+ if (conf->measure_namespace_rates) {
+ ret = ios_conf_load_namespace_conf (conf);
+
+ if (ret != 0) {
+ gf_log (GF_IO_STATS, GF_LOG_WARNING,
+ "Failed to re-load & parse \"%s\", error=%s.",
+ _IOS_NAMESPACE_CONF, strerror (ret));
+ }
+ }
+
/* Swap buffers */
ret = _ios_swap_sample_buf (this, &sample_buf);
if (ret != 0) {
@@ -3854,41 +4129,45 @@ _ios_dump_thread (xlator_t *this) {
goto out;
}
- /*
- * It's not clear whether we should reopen this each time, or
- * just hold it open and rewind/truncate on each iteration.
- * Leaving it alone for now.
- */
stats_logfp = fopen (stats_filename, "w+");
+
+ /* Dump statistics */
if (stats_logfp) {
- (void) ios_dump_args_init (&args,
- IOS_DUMP_TYPE_JSON_FILE,
- stats_logfp);
- io_stats_dump (this, sample_buf, &args,
- GF_CLI_INFO_ALL, _gf_false);
+ (void) ios_dump_args_init (
+ &args, IOS_DUMP_TYPE_JSON_FILE,
+ stats_logfp);
+ io_stats_dump (this, sample_buf, &args, GF_CLI_INFO_ALL, _gf_false);
fclose (stats_logfp);
- log_stats_fopen_failure = _gf_true;
- } else if (log_stats_fopen_failure) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not open stats-dump file %s (%s)",
- stats_filename, strerror(errno));
- log_stats_fopen_failure = _gf_false;
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ strerror (errno));
}
+
+ /* Dump latency samples */
samples_logfp = fopen (samples_filename, "a");
if (samples_logfp) {
io_stats_dump_latency_samples_logfp (this,
- samples_logfp,
- sample_buf);
+ samples_logfp, sample_buf);
fclose (samples_logfp);
- log_samples_fopen_failure = _gf_true;
- } else if (log_samples_fopen_failure) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not open samples-dump file %s (%s)",
- samples_filename, strerror(errno));
- log_samples_fopen_failure = _gf_false;
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ strerror (errno));
}
- /* cleanup sample buf */
+ /* Log throttling stats only if we were asked to measure namespace rates */
+ if (conf->measure_namespace_rates) {
+ foprate_logfp = fopen (foprate_filename, "w+");
+ if (foprate_logfp) {
+ (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_JSON_FILE, foprate_logfp);
+ args.this = this;
+ io_stats_dump_foprate_stats (&args);
+ fclose (foprate_logfp);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "%s", strerror (errno));
+ }
+ }
+
+ // cleanup sample buf
ios_destroy_sample_buf (sample_buf);
}
out:
@@ -3896,23 +4175,11 @@ out:
return NULL;
}
-static gf_boolean_t
-match_special_xattr (dict_t *d, char *k, data_t *val, void *mdata)
-{
- gf_boolean_t ret = _gf_false;
- if (fnmatch ("*io*stat*dump", k, 0) == 0) {
- ret = _gf_true;
- }
-
- return ret;
-}
-
int
io_stats_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- int ret = 0;
struct {
xlator_t *this;
inode_t *inode;
@@ -3923,13 +4190,7 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this,
stub.inode = loc->inode;
stub.path = loc->path;
- ret = dict_foreach_match (dict, match_special_xattr, NULL,
- conditional_dump, &stub);
- if (ret > 0) {
- /* Setxattr was on key 'io-stat-dump', hence dump and unwind
- * from here */
- goto out;
- }
+ dict_foreach (dict, conditional_dump, &stub);
ios_track_loc (frame, loc);
@@ -3940,10 +4201,6 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this,
FIRST_CHILD(this)->fops->setxattr,
loc, dict, flags, xdata);
return 0;
-
-out:
- STACK_UNWIND_STRICT (setxattr, frame, 0, 0, NULL);
- return 0;
}
@@ -3952,6 +4209,7 @@ io_stats_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_getxattr_cbk,
@@ -3967,6 +4225,7 @@ io_stats_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_removexattr_cbk,
@@ -3983,6 +4242,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this,
int32_t flags, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fsetxattr_cbk,
@@ -3998,6 +4258,7 @@ io_stats_fgetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fgetxattr_cbk,
@@ -4012,7 +4273,6 @@ int
io_stats_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
- ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fremovexattr_cbk,
@@ -4027,9 +4287,13 @@ int
io_stats_opendir (call_frame_t *frame, xlator_t *this,
loc_t *loc, fd_t *fd, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_opendir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->opendir,
@@ -4041,9 +4305,12 @@ int
io_stats_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
- frame->local = fd->inode;
+ ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_readdirp_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readdirp,
@@ -4056,8 +4323,12 @@ int
io_stats_readdir (call_frame_t *frame, xlator_t *this,
fd_t *fd, size_t size, off_t offset, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_readdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readdir,
@@ -4072,6 +4343,8 @@ io_stats_fsyncdir (call_frame_t *frame, xlator_t *this,
{
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_fsyncdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsyncdir,
@@ -4085,8 +4358,11 @@ io_stats_access (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t mask, dict_t *xdata)
{
ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_access_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->access,
@@ -4101,6 +4377,8 @@ io_stats_ftruncate (call_frame_t *frame, xlator_t *this,
{
START_FOP_LATENCY (frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND (frame, io_stats_ftruncate_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate,
@@ -4128,6 +4406,7 @@ io_stats_fstat (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
ios_track_fd (frame, fd);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fstat_cbk,
@@ -4144,6 +4423,8 @@ io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
{
START_FOP_LATENCY(frame);
+ FOP_THROTTLE (frame, this);
+
STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
xdata);
@@ -4411,13 +4692,73 @@ io_priv (xlator_t *this)
}
int
+init_namespace_conf (struct ios_conf *conf)
+{
+ int ret;
+
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Initializing structures for namespace stats tracking.");
+
+ /* Load namespace conf for the first time. */
+ ret = ios_conf_load_namespace_conf (conf);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+init_namespace_options (struct ios_conf *conf)
+{
+ if (!conf->iambrickd) {
+ conf->throttling_enabled = _gf_false;
+ conf->measure_namespace_rates = _gf_false;
+ gf_log (GF_IO_STATS, GF_LOG_DEBUG,
+ "Disabling throttling because I'm not a brick.");
+ return;
+ }
+
+ /* Can't throttle anything without measuring namespace rates */
+ /* TODO: is there a way to do this same enforcement with the namespace
+ * collection option? */
+ if (conf->throttling_enabled) {
+ gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Throttling enabled.");
+ conf->measure_namespace_rates = _gf_true;
+ }
+
+ if (conf->measure_namespace_rates) {
+ if (init_namespace_conf (conf) != 0) {
+ gf_log (GF_IO_STATS, GF_LOG_WARNING,
+ "Disabling namespace measurements even though "
+ "it was marked as enabled, due to failures "
+ "during initialization!");
+ conf->throttling_enabled = _gf_false;
+ conf->measure_namespace_rates = _gf_false;
+ } else {
+ gf_log (GF_IO_STATS, GF_LOG_DEBUG, "Namespace rates enabled.");
+ }
+
+ if (conf->audit_creates_and_unlinks || conf->sample_all_errors ||
+ conf->sample_hard_errors) {
+ gf_log (GF_IO_STATS, GF_LOG_WARNING,
+ "Fop sample auditing or 1:1 error sampling is "
+ "enabled, which means that namespace rates might "
+ "be reported incorrectly in some cases!");
+ }
+ }
+}
+
+int
reconfigure (xlator_t *this, dict_t *options)
{
struct ios_conf *conf = NULL;
int ret = -1;
char *sys_log_str = NULL;
char *log_format_str = NULL;
- char *logger_str = NULL;
+ char *logger_str = NULL;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
@@ -4425,7 +4766,6 @@ reconfigure (xlator_t *this, dict_t *options)
int logger = -1;
uint32_t log_buf_size = 0;
uint32_t log_flush_timeout = 0;
- int32_t old_dump_interval;
if (!this || !this->private)
goto out;
@@ -4441,13 +4781,11 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("latency-measurement", conf->measure_latency,
options, bool, out);
- old_dump_interval = conf->ios_dump_interval;
GF_OPTION_RECONF ("ios-dump-interval", conf->ios_dump_interval, options,
int32, out);
- if ((old_dump_interval <= 0) && (conf->ios_dump_interval > 0)) {
- pthread_create (&conf->dump_thread, NULL,
- (void *) &_ios_dump_thread, this);
- }
+
+ GF_OPTION_RECONF ("ns-rate-window", conf->ns_rate_window, options,
+ int32, out);
GF_OPTION_RECONF ("ios-sample-interval", conf->ios_sample_interval,
options, int32, out);
@@ -4484,22 +4822,25 @@ reconfigure (xlator_t *this, dict_t *options)
time, out);
gf_log_set_log_flush_timeout (log_flush_timeout);
- GF_OPTION_RECONF ("fop-sample-enable-audit",
- conf->audit_creates_and_unlinks, options, bool, out);
+ GF_OPTION_RECONF ("fop-sample-enable-audit", conf->audit_creates_and_unlinks, options, bool, out);
+
+ GF_OPTION_RECONF ("fop-sample-all-errors", conf->sample_all_errors, options, bool, out);
+
+ GF_OPTION_RECONF ("fop-sample-hard-errors", conf->sample_hard_errors, options, bool, out);
+
+ GF_OPTION_RECONF ("dump-percentile-latencies", conf->dump_percentile_latencies, options, bool, out);
- GF_OPTION_RECONF ("fop-sample-all-errors",
- conf->sample_all_errors, options, bool, out);
+ GF_OPTION_RECONF ("iam-gfproxy-daemon", conf->iamgfproxyd, options, bool, out);
- GF_OPTION_RECONF ("fop-sample-hard-errors",
- conf->sample_hard_errors, options, bool, out);
+ GF_OPTION_RECONF ("fop-throttling-enable", conf->throttling_enabled, options, bool, out);
- GF_OPTION_RECONF ("dump-percentile-latencies",
- conf->dump_percentile_latencies, options, bool, out);
+ GF_OPTION_RECONF ("measure-namespace-rates", conf->measure_namespace_rates, options, bool, out);
+
+ init_namespace_options (conf);
ret = 0;
out:
- gf_log (this ? this->name : "io-stats",
- GF_LOG_DEBUG, "reconfigure returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "reconfigure returning %d", ret);
return ret;
}
@@ -4529,10 +4870,149 @@ ios_conf_destroy (struct ios_conf *conf)
if (!conf)
return;
+ dict_destroy (conf->hash_to_ns);
+ dict_destroy (conf->throttling_rates);
+ io_stats_destroy_fop_hits_dict (&conf->fop_hits_dict);
ios_destroy_top_stats (conf);
_ios_destroy_dump_thread (conf);
LOCK_DESTROY (&conf->lock);
- GF_FREE(conf);
+ GF_FREE (conf);
+}
+
+int
+ios_conf_parse_namespace_conf_line (char *file_line, dict_t *hash_to_ns,
+ dict_t *throttle_rates)
+{
+ int ret = 0;
+ int scanned = 0;
+ uint32_t ns_hash = 0;
+ int64_t fop_rate = 0;
+ char *ns_name = NULL;
+ char ns_key[DICT_UINT32_KEY_SIZE];
+
+ ns_name = GF_CALLOC (strlen (file_line), sizeof (char), 0);
+
+ if (!ns_name) {
+ gf_log (GF_IO_STATS, GF_LOG_CRITICAL,
+ "Memory allocation error!");
+ ret = ENOMEM;
+ goto out;
+ }
+
+ scanned = sscanf (file_line, "%s %ld", ns_name, &fop_rate);
+
+ if (scanned < 1 || strlen (ns_name) < 1) {
+ /* TODO(mgoulet): Log this line as skipped. */
+ goto out;
+ }
+
+ ns_hash = SuperFastHash (ns_name, strlen (ns_name));
+ dict_uint32_to_key (ns_hash, ns_key);
+ ret = dict_set_dynstr_with_alloc (hash_to_ns, ns_key, ns_name);
+
+ if (ret != 0) {
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Failed to set hash/namespace pair %u -> \'%s\'",
+ ns_hash, ns_name);
+ goto out;
+ } else {
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Loaded namespace %u -> \'%s\'",
+ ns_hash, ns_name);
+ }
+
+ if (scanned == 1) {
+ /* We only detected a namespace, but no throttle rate,
+ * so let's quit now. */
+ goto out;
+ }
+
+ ret = dict_set_int64 (throttle_rates, ns_name, fop_rate);
+
+ if (ret != 0) {
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Failed to set fop rate \'%s\' -> %ld",
+ ns_name, fop_rate);
+ goto out;
+ } else {
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Loaded fop rate \'%s\' -> %ld",
+ ns_name, fop_rate);
+ }
+
+out:
+ if (ns_name) {
+ GF_FREE (ns_name);
+ }
+
+ return ret;
+}
+
+int
+ios_conf_load_namespace_conf (struct ios_conf *conf)
+{
+ int ret = 0;
+ FILE *fp = NULL;
+ char *line = NULL;
+ size_t len = 0;
+ time_t curr_mtime = {0};
+
+ get_file_mtime (_IOS_NAMESPACE_CONF, &curr_mtime);
+ if (conf->namespace_conf_mtime == curr_mtime) {
+ gf_log (GF_IO_STATS, GF_LOG_DEBUG,
+ "%s has not changed, not reloading.",
+ _IOS_NAMESPACE_CONF);
+ return 0;
+ }
+
+ if (!conf->fop_hits_dict) {
+ conf->fop_hits_dict = dict_new ();
+ if (!conf->fop_hits_dict) {
+ ret = ENOMEM;
+ goto done;
+ }
+ }
+
+ if (!conf->hash_to_ns) {
+ conf->hash_to_ns = dict_new ();
+ if (!conf->hash_to_ns) {
+ ret = ENOMEM;
+ goto done;
+ }
+ }
+
+ if (!conf->throttling_rates) {
+ conf->throttling_rates = dict_new ();
+ if (!conf->throttling_rates) {
+ ret = ENOMEM;
+ goto done;
+ }
+ }
+
+ fp = fopen (_IOS_NAMESPACE_CONF, "r");
+ if (!fp) {
+ ret = ENOENT;
+ goto done;
+ }
+
+ gf_log (GF_IO_STATS, GF_LOG_INFO,
+ "Reloading %s from disk.",
+ _IOS_NAMESPACE_CONF);
+
+ while (getline (&line, &len, fp) != -1) {
+ ios_conf_parse_namespace_conf_line (line, conf->hash_to_ns,
+ conf->throttling_rates);
+ }
+
+ free (line);
+ conf->namespace_conf_mtime = curr_mtime;
+
+done:
+ if (fp) {
+ fclose (fp);
+ }
+
+ return ret;
}
int
@@ -4550,6 +5030,7 @@ init (xlator_t *this)
int ret = -1;
uint32_t log_buf_size = 0;
uint32_t log_flush_timeout = 0;
+ glusterfs_ctx_t *ctx = NULL;
if (!this)
return -1;
@@ -4568,11 +5049,19 @@ init (xlator_t *this)
"dangling volume. check volfile ");
}
+ ctx = this->ctx;
+ GF_ASSERT (ctx);
+
+ if (ctx->cmd_args.stats_instance_name) {
+ this->instance_name = ctx->cmd_args.stats_instance_name;
+ }
+
conf = GF_CALLOC (1, sizeof(*conf), gf_io_stats_mt_ios_conf);
if (!conf)
goto out;
+
/*
* Init it just after calloc, so that we are sure the lock is inited
* in case of error paths.
@@ -4595,14 +5084,19 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-gfproxy-daemon", conf->iamgfproxyd, bool, out);
- GF_OPTION_INIT ("fop-sample-hard-errors", conf->sample_hard_errors,
- bool, out);
+ GF_OPTION_INIT ("fop-throttling-enable", conf->throttling_enabled, bool, out);
- GF_OPTION_INIT ("fop-sample-all-errors", conf->sample_all_errors,
- bool, out);
+ GF_OPTION_INIT ("measure-namespace-rates", conf->measure_namespace_rates, bool, out);
+
+ GF_OPTION_INIT ("fop-sample-enable-audit", conf->audit_creates_and_unlinks, bool, out);
+
+ GF_OPTION_INIT ("fop-sample-all-errors", conf->sample_all_errors, bool, out);
- GF_OPTION_INIT ("fop-sample-enable-audit",
- conf->audit_creates_and_unlinks, bool, out);
+ GF_OPTION_INIT ("fop-sample-hard-errors", conf->sample_hard_errors, bool, out);
+
+ GF_OPTION_INIT ("dump-percentile-latencies", conf->dump_percentile_latencies, bool, out);
+
+ init_namespace_options (conf);
GF_OPTION_INIT ("dump-fd-stats", conf->dump_fd_stats, bool, out);
@@ -4614,6 +5108,9 @@ init (xlator_t *this)
GF_OPTION_INIT ("ios-dump-interval", conf->ios_dump_interval,
int32, out);
+ GF_OPTION_INIT ("ns-rate-window", conf->ns_rate_window,
+ int32, out);
+
GF_OPTION_INIT ("ios-sample-interval", conf->ios_sample_interval,
int32, out);
@@ -4639,8 +5136,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("log-level", log_str, str, out);
if (log_str) {
log_level = glusterd_check_log_level (log_str);
- if (DEFAULT_LOG_LEVEL != log_level)
- gf_log_set_loglevel (log_level);
+ gf_log_set_loglevel (log_level);
}
GF_OPTION_INIT ("logger", logger_str, str, out);
@@ -4661,14 +5157,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("log-flush-timeout", log_flush_timeout, time, out);
gf_log_set_log_flush_timeout (log_flush_timeout);
- GF_OPTION_INIT ("dump-percentile-latencies",
- conf->dump_percentile_latencies, bool, out);
-
+
this->private = conf;
- if (conf->ios_dump_interval > 0) {
- pthread_create (&conf->dump_thread, NULL,
- (void *) &_ios_dump_thread, this);
- }
+ pthread_create (&conf->dump_thread, NULL,
+ (void *) &_ios_dump_thread, this);
ret = 0;
out:
if (!this->private) {
@@ -4688,9 +5180,8 @@ fini (xlator_t *this)
return;
conf = this->private;
-
- ios_conf_destroy (conf);
this->private = NULL;
+ ios_conf_destroy (conf);
gf_log (this->name, GF_LOG_INFO,
"io-stats translator unloaded");
return;
@@ -4708,6 +5199,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
double throughput = 0;
double time = 0;
gf_boolean_t is_peek = _gf_false;
+
va_list ap;
dict = data;
@@ -4800,8 +5292,8 @@ notify (xlator_t *this, int32_t event, void *data, ...)
(void) ios_dump_args_init (&args,
IOS_DUMP_TYPE_DICT, output);
- ret = io_stats_dump (this, NULL, &args, op,
- is_peek);
+ ret = io_stats_dump (this, NULL, &args,
+ op, is_peek);
}
}
break;
@@ -4876,21 +5368,13 @@ struct volume_options options[] = {
.description = "If on stats related to file-operations would be "
"tracked inside GlusterFS data-structures."
},
- { .key = { "ios-dump-interval" },
- .type = GF_OPTION_TYPE_INT,
- .min = 0,
- .max = 3600,
- .default_value = "0",
- .description = "Interval (in seconds) at which to auto-dump "
- "statistics. Zero disables automatic dumping."
- },
{ .key = { "ios-sample-interval" },
.type = GF_OPTION_TYPE_INT,
.min = 0,
.max = 65535,
- .default_value = "0",
+ .default_value = "100",
.description = "Interval in which we want to collect FOP latency "
- "samples. 2 means collect a sample every 2nd FOP."
+ "samples. 2 means collect a sample every 2nd FOP."
},
{ .key = { "ios-sample-buf-size" },
.type = GF_OPTION_TYPE_INT,
@@ -4899,13 +5383,27 @@ struct volume_options options[] = {
.default_value = "65535",
.description = "The maximum size of our FOP sampling ring buffer."
},
+ { .key = { "ios-dump-interval" },
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 3600,
+ .default_value = "5",
+ .description = "Interval (in seconds) at which to auto-dump statistics."
+ },
+ { .key = { "ns-rate-window" },
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = FOP_HITS_SIZE,
+ .default_value = "5",
+ .description = "Number of ios-dump-interval length windows to calculate "
+ "our namespace FOP simple moving average (SMA) with."
+ },
{ .key = { "ios-dnscache-ttl-sec" },
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 3600 * 72,
- .default_value = "86400",
- .description = "The interval after wish a cached DNS entry will be "
- "re-validated. Default: 24 hrs"
+ .default_value = "43200",
+ .description = "The interval after which a cached DNS entry will be "
+ "re-validated. Default: 12 hrs"
},
{ .key = { "latency-measurement" },
.type = GF_OPTION_TYPE_BOOL,
@@ -5026,58 +5524,71 @@ struct volume_options options[] = {
"log messages that can be buffered for a time equal to"
" the value of the option brick-log-flush-timeout."
},
- { .key = {"iam-self-heal-daemon"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option differentiates if the io-stats "
- "translator is running as part of an self-heal "
- "daemon or not."
- },
- { .key = {"iam-nfs-daemon"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option differentiates if the io-stats "
- "translator is running as part of an NFS daemon "
- "or not."
+ { .key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the io-stats "
+ "translator is running as part of self-heal-daemon "
+ "or not."
},
- { .key = {"iam-brick-daemon"},
+ { .key = {"iam-nfs-daemon"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "This option differentiates if the io-stats "
- "translator is running as part of brick daemon "
+ "translator is running as part of an NFS daemon "
"or not."
},
+ {
+ .key = {"iam-brick-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the io-stats "
+ "translator is running as part of brick daemon "
+ "or not."
+ },
{ .key = {"iam-gfproxy-daemon"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "This option differentiates if the io-stats "
- "translator is running as part of an GFProxy daemon "
+ "translator is running as part of an NFS daemon "
"or not."
},
+ { .key = {"fop-throttling-enable"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option enables whether certain namespaces "
+ "(exports) will be throttled when they exceed their "
+ "allowed FOPs/sec rate."
+ },
+ { .key = {"measure-namespace-rates"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option enables tracking of FOP rates for individual namespaces. "
+ "It is automatically set to true if fop-throttling-enable is enabled."
+ },
{ .key = {"fop-sample-enable-audit"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "This option samples the following FOPs 1:1: "
- "CREATE, UNLINK, MKDIR, RMDIR, TRUNCATE. "
+ .description = "This option samples the following FOPs 1:1: CREATE, UNLINK, MKDIR, RMDIR, TRUNCATE."
+ " fop-sample-interval must be set to something non-zero."
},
{ .key = {"fop-sample-all-errors"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "This option samples all fops that failed."
+ .description = "This option samples all fops with a non-zero op_errno "
+ "value 1:1."
},
{ .key = {"fop-sample-hard-errors"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
- .description = "This option samples all fops with \"hard errors\""
+ .description = "This option samples all fops with \"hard errors\" 1:1,"
"including EROFS, ENOSPC, etc."
},
{ .key = { "dump-percentile-latencies" },
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "If on the p50, p90, p95, and p99 latency of each "
- "operation and all types is dumped at each sample "
- "interval."
+ .description = "If on the p50, p90, p95, and p99 latency of each operation "
+ "and all types is dumped at each sample interval. "
},
{ .key = {NULL} },
-
};
diff --git a/xlators/debug/io-stats/src/io-stats.h b/xlators/debug/io-stats/src/io-stats.h
new file mode 100644
index 00000000000..678f67929f4
--- /dev/null
+++ b/xlators/debug/io-stats/src/io-stats.h
@@ -0,0 +1,381 @@
+#ifndef _IO_STATS_H_
+#define _IO_STATS_H_
+
+#define OFFSET_WRITE_DETECTOR 0
+#define UNKNOWN_WRITE_OFFSET ((off_t)-1)
+
+#define FOP_HITS_SIZE 32
+struct _ios_fop_hits_s {
+ uint64_t idx;
+ uint64_t hits[FOP_HITS_SIZE];
+ double elapsed[FOP_HITS_SIZE];
+ double avg_fops;
+ double avg_latency;
+};
+typedef struct _ios_fop_hits_s ios_fop_hits_t;
+
+ios_fop_hits_t *ios_fop_hits_init ();
+
+#define GF_IO_STATS "io-stats"
+
+#define MAX_LIST_MEMBERS 100
+#define DEFAULT_PWD_BUF_SZ 16384
+#define DEFAULT_GRP_BUF_SZ 16384
+#define IOS_MAX_ERRORS 132
+
+typedef enum {
+ IOS_STATS_TYPE_NONE,
+ IOS_STATS_TYPE_OPEN,
+ IOS_STATS_TYPE_READ,
+ IOS_STATS_TYPE_WRITE,
+ IOS_STATS_TYPE_OPENDIR,
+ IOS_STATS_TYPE_READDIRP,
+ IOS_STATS_TYPE_READ_THROUGHPUT,
+ IOS_STATS_TYPE_WRITE_THROUGHPUT,
+ IOS_STATS_TYPE_MAX
+}ios_stats_type_t;
+
+typedef enum {
+ IOS_STATS_THRU_READ,
+ IOS_STATS_THRU_WRITE,
+ IOS_STATS_THRU_MAX,
+}ios_stats_thru_t;
+
+struct ios_stat_lat {
+ struct timeval time;
+ double throughput;
+};
+
+struct ios_stat {
+ gf_lock_t lock;
+ uuid_t gfid;
+ char *filename;
+ uint64_t counters [IOS_STATS_TYPE_MAX];
+ struct ios_stat_lat thru_counters [IOS_STATS_THRU_MAX];
+ int refcnt;
+#if OFFSET_WRITE_DETECTOR
+ off_t expected_write_offset;
+#endif
+};
+
+struct ios_stat_list {
+ struct list_head list;
+ struct ios_stat *iosstat;
+ double value;
+};
+
+struct ios_stat_head {
+ gf_lock_t lock;
+ double min_cnt;
+ uint64_t members;
+ struct ios_stat_list *iosstats;
+};
+
+typedef struct _ios_sample_t {
+ uid_t uid;
+ gid_t gid;
+ char identifier[UNIX_PATH_MAX];
+ glusterfs_fop_t fop_type;
+ gf_fop_pri_t fop_pri;
+ struct timeval timestamp;
+ double elapsed;
+ ns_info_t ns_info;
+ char path[4096];
+ gf_boolean_t have_path;
+ int32_t op_ret;
+ int32_t op_errno;
+} ios_sample_t;
+
+
+typedef struct _ios_sample_buf_t {
+ uint64_t pos; /* Position in write buffer */
+ uint64_t size; /* Size of ring buffer */
+ uint64_t collected; /* Number of samples we've collected */
+ uint64_t observed; /* Number of FOPs we've observed */
+ ios_sample_t *ios_samples; /* Our list of samples */
+} ios_sample_buf_t;
+
+
+struct ios_lat {
+ double min;
+ double max;
+ double avg;
+ uint64_t total;
+};
+
+struct ios_global_stats {
+ uint64_t data_written;
+ uint64_t data_read;
+ uint64_t block_count_write[32];
+ uint64_t block_count_read[32];
+ uint64_t fop_hits[GF_FOP_MAXVALUE];
+ struct timeval started_at;
+ struct ios_lat latency[GF_FOP_MAXVALUE];
+ uint64_t errno_count[IOS_MAX_ERRORS];
+ uint64_t nr_opens;
+ uint64_t max_nr_opens;
+ struct timeval max_openfd_time;
+#if OFFSET_WRITE_DETECTOR
+ uint64_t total_write_ops;
+ uint64_t offset_write_ops;
+#endif
+};
+
+struct ios_conf {
+ gf_lock_t lock;
+ struct ios_global_stats cumulative;
+ uint64_t increment;
+ struct ios_global_stats incremental;
+ gf_boolean_t dump_fd_stats;
+ gf_boolean_t count_fop_hits;
+ gf_boolean_t measure_latency;
+ struct ios_stat_head list[IOS_STATS_TYPE_MAX];
+ struct ios_stat_head thru_list[IOS_STATS_THRU_MAX];
+ int32_t ios_dump_interval;
+ int32_t ns_rate_window;
+ pthread_t dump_thread;
+ gf_boolean_t dump_thread_should_die;
+ gf_lock_t ios_sampling_lock;
+ int32_t ios_sample_interval;
+ int32_t ios_sample_buf_size;
+ ios_sample_buf_t *ios_sample_buf;
+ struct dnscache *dnscache;
+ int32_t ios_dnscache_ttl_sec;
+ dict_t *hash_to_ns; /* Hash -> NS name */
+ dict_t *fop_hits_dict; /* NS -> Real rate */
+ dict_t *throttling_rates; /* NS -> Max rate */
+ gf_boolean_t iamgfproxyd;
+ gf_boolean_t iamnfsd;
+ gf_boolean_t iamshd;
+ gf_boolean_t iambrickd;
+ gf_boolean_t throttling_enabled;
+ time_t namespace_conf_mtime;
+ gf_boolean_t measure_namespace_rates;
+ gf_boolean_t audit_creates_and_unlinks;
+ gf_boolean_t sample_hard_errors;
+ gf_boolean_t dump_percentile_latencies;
+ gf_boolean_t sample_all_errors;
+ uint32_t outstanding_req;
+};
+
+
+struct ios_fd {
+ char *filename;
+ uint64_t data_written;
+ uint64_t data_read;
+ uint64_t block_count_write[32];
+ uint64_t block_count_read[32];
+ struct timeval opened_at;
+};
+
+typedef enum {
+ IOS_DUMP_TYPE_NONE = 0,
+ IOS_DUMP_TYPE_FILE = 1,
+ IOS_DUMP_TYPE_DICT = 2,
+ IOS_DUMP_TYPE_JSON_FILE = 3,
+ IOS_DUMP_TYPE_SAMPLES = 4,
+ IOS_DUMP_TYPE_MAX = 5
+} ios_dump_type_t;
+
+struct ios_dump_args {
+ xlator_t *this;
+ ios_dump_type_t type;
+ union {
+ FILE *logfp;
+ dict_t *dict;
+ } u;
+};
+
+typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*,
+ int, int, uint64_t);
+
+// Will be present in frame->local
+struct ios_local {
+ inode_t *inode;
+ loc_t loc;
+ fd_t *fd;
+};
+
+#define ios_local_new() GF_CALLOC (1, sizeof (struct ios_local), gf_common_mt_char);
+
+void
+ios_local_free (struct ios_local *local)
+{
+ if (!local)
+ return;
+
+ inode_unref (local->inode);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ loc_wipe (&local->loc);
+ memset (local, 0, sizeof (*local));
+ GF_FREE (local);
+}
+
+struct volume_options options[];
+
+inline static int
+is_fop_latency_started (call_frame_t *frame)
+{
+ GF_ASSERT (frame);
+ struct timeval epoch = {0, };
+ return memcmp (&frame->begin, &epoch, sizeof (epoch));
+}
+
+const char *_IOS_DUMP_DIR = "/var/lib/glusterd/stats";
+const char *_IOS_SAMP_DIR = "/var/log/glusterfs/samples";
+const char *_IOS_NAMESPACE_CONF = TOSTRING (GLUSTERD_WORKDIR) "/namespaces.conf";
+
+extern const char *__progname;
+
+/* This is a list of errors which are in some way critical.
+ * It is useful to sample these errors even if other errors
+ * should be ignored. */
+const int32_t ios_hard_error_list[] = {
+ EIO,
+ EROFS,
+ ENOSPC,
+ ENOTCONN,
+ ESTALE,
+};
+
+#define IOS_HARD_ERROR_LIST_SIZE (sizeof(ios_hard_error_list) / sizeof(int32_t))
+
+const char *errno_to_name[IOS_MAX_ERRORS] = {
+ "success", /* 0 */
+ "eperm",
+ "enoent",
+ "esrch",
+ "eintr",
+ "eio",
+ "enxio",
+ "e2big",
+ "enoexec",
+ "ebadf",
+ "echild",
+ "eagain",
+ "enomem",
+ "eacces",
+ "efault",
+ "enotblk",
+ "ebusy",
+ "eexist",
+ "exdev",
+ "enodev",
+ "enotdir",
+ "eisdir",
+ "einval",
+ "enfile",
+ "emfile",
+ "enotty",
+ "etxtbsy",
+ "efbig",
+ "enospc",
+ "espipe",
+ "erofs",
+ "emlink",
+ "epipe",
+ "edom",
+ "erange",
+ "edeadlk",
+ "enametoolong",
+ "enolck",
+ "enosys",
+ "enotempty",
+ "eloop",
+ "ewouldblock",
+ "enomsg",
+ "eidrm",
+ "echrng",
+ "el2nsync",
+ "el3hlt",
+ "el3rst",
+ "elnrng",
+ "eunatch",
+ "enocsi",
+ "el2hlt",
+ "ebade",
+ "ebadr",
+ "exfull",
+ "enoano",
+ "ebadrqc",
+ "ebadslt",
+ "edeadlock",
+ "ebfont",
+ "enostr",
+ "enodata",
+ "etime",
+ "enosr",
+ "enonet",
+ "enopkg",
+ "eremote",
+ "enolink",
+ "eadv",
+ "esrmnt",
+ "ecomm",
+ "eproto",
+ "emultihop",
+ "edotdot",
+ "ebadmsg",
+ "eoverflow",
+ "enotuniq",
+ "ebadfd",
+ "eremchg",
+ "elibacc",
+ "elibbad",
+ "elibscn",
+ "elibmax",
+ "elibexec",
+ "eilseq",
+ "erestart",
+ "estrpipe",
+ "eusers",
+ "enotsock",
+ "edestaddrreq",
+ "emsgsize",
+ "eprototype",
+ "enoprotoopt",
+ "eprotonosupport",
+ "esocktnosupport",
+ "eopnotsupp",
+ "epfnosupport",
+ "eafnosupport",
+ "eaddrinuse",
+ "eaddrnotavail",
+ "enetdown",
+ "enetunreach",
+ "enetreset",
+ "econnaborted",
+ "econnreset",
+ "enobufs",
+ "eisconn",
+ "enotconn",
+ "eshutdown",
+ "etoomanyrefs",
+ "etimedout",
+ "econnrefused",
+ "ehostdown",
+ "ehostunreach",
+ "ealready",
+ "einprogress",
+ "estale",
+ "euclean",
+ "enotnam",
+ "enavail",
+ "eisnam",
+ "eremoteio",
+ "edquot",
+ "enomedium",
+ "emediumtype",
+ "ecanceled",
+ "enokey",
+ "ekeyexpired",
+ "ekeyrevoked",
+ "ekeyrejected",
+ "eownerdead",
+ "enotrecoverable"
+};
+
+#endif /* _IO_STATS_H_ */
diff --git a/xlators/features/namespace/src/namespace.c b/xlators/features/namespace/src/namespace.c
index 639dfa33fe2..4585087025e 100644
--- a/xlators/features/namespace/src/namespace.c
+++ b/xlators/features/namespace/src/namespace.c
@@ -143,7 +143,7 @@ ns_inode_ctx_put (inode_t *inode, xlator_t *this, ns_info_t *info)
int ret = -1;
if (!inode || !this) {
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_DEBUG,
"Need a valid inode and xlator to cache ns_info.");
ret = -1;
goto out;
@@ -253,10 +253,10 @@ get_path_resume_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"G>P %s %10u namespace found %s",
uuid_utoa (local->loc.inode->gfid), info->hash, path);
} else if (ret == PATH_PARSE_RESULT_NO_PATH) {
- gf_log (this->name, GF_LOG_WARNING, "G>P %s has no path",
+ gf_log (this->name, GF_LOG_DEBUG, "G>P %s has no path",
uuid_utoa (local->loc.inode->gfid));
} else if (ret == PATH_PARSE_RESULT_IS_GFID) {
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_DEBUG,
"G>P %s winding failed, still have gfid",
uuid_utoa (local->loc.inode->gfid));
}
@@ -303,16 +303,26 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc
* from the inode context, then from the loc's path itself. */
if (!loc || !loc->path || !loc->inode) {
ret = PATH_PARSE_RESULT_NO_PATH;
- } else if (!ns_inode_ctx_get (loc->inode, this, info)) {
+ goto out;
+ }
+
+ if (!ns_inode_ctx_get (loc->inode, this, info)) {
ret = PATH_PARSE_RESULT_FOUND;
- } else {
- ret = parse_path (info, loc->path);
- gf_log (this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s",
- fn, loc->path);
+ goto out;
+ }
- if (ret == PATH_PARSE_RESULT_FOUND) {
- ns_inode_ctx_put (loc->inode, this, info);
- }
+ if (gf_uuid_is_null (loc->inode->gfid) && gf_uuid_is_null (loc->gfid)) {
+ ret = PATH_PARSE_RESULT_NO_PATH;
+ goto out;
+ }
+
+ ret = parse_path (info, loc->path);
+ gf_log (this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s",
+ fn, loc->path);
+
+ if (ret == PATH_PARSE_RESULT_FOUND) {
+ ns_inode_ctx_put (loc->inode, this, info);
+ goto out;
}
/* Keep trying by calling inode_path next, making sure to copy
@@ -337,6 +347,7 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc
}
}
+out:
/* Report our status, and if we have a GFID, we'll eventually try a
* GET_ANCESTRY_PATH_KEY wind when we return from this function. */
if (ret == PATH_PARSE_RESULT_FOUND) {
@@ -344,7 +355,7 @@ set_ns_from_loc (const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc
"%s: LOC %s %10u namespace found for %s", fn,
uuid_utoa (loc->inode->gfid), info->hash, loc->path);
} else if (ret == PATH_PARSE_RESULT_NO_PATH) {
- gf_log (this->name, GF_LOG_WARNING, "%s: LOC has no path", fn);
+ gf_log (this->name, GF_LOG_DEBUG, "%s: LOC has no path", fn);
} else if (ret == PATH_PARSE_RESULT_IS_GFID) {
/* Make sure to copy the inode's gfid for the eventual wind. */
if (gf_uuid_is_null (loc->inode->gfid)) {
@@ -381,9 +392,20 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd)
* from the inode context, then inode_path. */
if (!fd || !fd->inode) {
ret = PATH_PARSE_RESULT_NO_PATH;
- } else if (!ns_inode_ctx_get (fd->inode, this, info)) {
+ goto out;
+ }
+
+ if (!ns_inode_ctx_get (fd->inode, this, info)) {
ret = PATH_PARSE_RESULT_FOUND;
- } else if (inode_path (fd->inode, NULL, &path) >= 0 && path) {
+ goto out;
+ }
+
+ if (gf_uuid_is_null (fd->inode->gfid)) {
+ ret = PATH_PARSE_RESULT_NO_PATH;
+ goto out;
+ }
+
+ if (inode_path (fd->inode, NULL, &path) >= 0 && path) {
ret = parse_path (info, path);
gf_log (this->name, GF_LOG_DEBUG, "%s: FD retrieved path %s",
fn, path);
@@ -397,13 +419,14 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd)
GF_FREE (path);
}
+out:
/* Report our status, and if we have a GFID, we'll eventually try a
* GET_ANCESTRY_PATH_KEY wind when we return from this function. */
if (ret == PATH_PARSE_RESULT_FOUND) {
gf_log (this->name, GF_LOG_DEBUG, "%s: FD %s %10u namespace found",
fn, uuid_utoa (fd->inode->gfid), info->hash);
} else if (ret == PATH_PARSE_RESULT_NO_PATH) {
- gf_log (this->name, GF_LOG_WARNING, "%s: FD has no path", fn);
+ gf_log (this->name, GF_LOG_DEBUG, "%s: FD has no path", fn);
} else if (ret == PATH_PARSE_RESULT_IS_GFID) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: FD %s winding, looking for path",
@@ -442,6 +465,12 @@ set_ns_from_fd (const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd)
goto wind; \
} \
\
+ if (gf_uuid_is_null (inode->gfid)) { \
+ gf_log (this->name, GF_LOG_DEBUG, \
+ "Cannot wind on a NULL GFID."); \
+ goto wind; \
+ } \
+ \
new_frame->root->uid = new_frame->root->gid = 0; \
/* Put a phony "not found" NS info into this call. */ \
new_frame->root->ns_info = *info; \
@@ -750,8 +779,10 @@ wind:
int32_t
ns_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ ns_private_t *priv = (ns_private_t *)this->private;
+
path_parse_result_t ret = set_ns_from_loc (__FUNCTION__, frame, this, loc);
- if (ret == PATH_PARSE_RESULT_IS_GFID) {
+ if (ret == PATH_PARSE_RESULT_IS_GFID && priv->wind_lookups) {
GET_ANCESTRY_PATH_WIND (lookup, loc->inode, loc, xdata);
return 0;
}
@@ -1229,6 +1260,7 @@ init (xlator_t *this)
}
GF_OPTION_INIT ("tag-namespaces", priv->tag_namespaces, bool, out);
+ GF_OPTION_INIT ("wind-lookups", priv->wind_lookups, bool, out);
gf_log (this->name, GF_LOG_INFO, "Namespace xlator loaded");
this->private = priv;
@@ -1262,6 +1294,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("tag-namespaces", priv->tag_namespaces, options, bool,
out);
+ GF_OPTION_RECONF ("wind-lookups", priv->wind_lookups, options,
+ bool, out);
ret = 0;
out:
@@ -1331,5 +1365,16 @@ struct volume_options options[] = {
"that tags every fop with a namespace hash for later "
"throttling, stats collection, logging, etc.",
},
+ {
+ .key = { "wind-lookups" },
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "If enabled, a GET_ANCESTRY_PATH_KEY getxattr wind "
+ "will be scheduled for a LOOKUP. Since we have many "
+ "LOOKUP requests which are sent for files that don't "
+                                "exist, it's often beneficial to disable this option, "
+ "so we don't have a high percentage of bad GETXATTRs "
+ "in io-stats.",
+ },
{.key = { NULL } },
};
diff --git a/xlators/features/namespace/src/namespace.h b/xlators/features/namespace/src/namespace.h
index 3b9f782cb1a..affaba04681 100644
--- a/xlators/features/namespace/src/namespace.h
+++ b/xlators/features/namespace/src/namespace.h
@@ -13,6 +13,7 @@
typedef struct {
gf_boolean_t tag_namespaces;
+ gf_boolean_t wind_lookups;
} ns_private_t;
typedef struct {
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 6fcf6892d27..a07fd9c3232 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -2447,8 +2447,8 @@ out:
* the topology of the brick graph */
static volgen_brick_xlator_t server_graph_table[] = {
{brick_graph_add_server, NULL},
- {brick_graph_add_decompounder, "decompounder"},
{brick_graph_add_io_stats, "NULL"},
+ {brick_graph_add_decompounder, "decompounder"},
{brick_graph_add_namespace, "namespace"},
{brick_graph_add_cdc, NULL},
{brick_graph_add_quota, "quota"},
diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am
index 1d09eace2ed..bdbd0290fd6 100644
--- a/xlators/performance/io-threads/src/Makefile.am
+++ b/xlators/performance/io-threads/src/Makefile.am
@@ -1,15 +1,16 @@
xlator_LTLIBRARIES = io-threads.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-io_threads_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+io_threads_la_LDFLAGS = -module -avoid-version
io_threads_la_SOURCES = io-threads.c
io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = io-threads.h iot-mem-types.h io-threads-messages.h
+noinst_HEADERS = io-threads.h iot-mem-types.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -DGLUSTERD_WORKDIR=$(GLUSTERD_WORKDIR)
AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
+CLEANFILES =
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 04d94691ba3..c0ebdd33caf 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -8,6 +8,11 @@
cases as published by the Free Software Foundation.
*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
#include "call-stub.h"
#include "defaults.h"
#include "glusterfs.h"
@@ -20,12 +25,14 @@
#include <sys/time.h>
#include <time.h>
#include "locking.h"
-#include "io-threads-messages.h"
#include "timespec.h"
+#include "hashfn.h"
void *iot_worker (void *arg);
int iot_workers_scale (iot_conf_t *conf);
int __iot_workers_scale (iot_conf_t *conf);
+int32_t iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight);
+int32_t __iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight);
struct volume_options options[];
#define IOT_FOP(name, frame, this, args ...) \
@@ -50,84 +57,622 @@ struct volume_options options[];
} \
} while (0)
+gf_boolean_t
+__iot_ns_queue_empty (iot_ns_queue_t *queue)
+{
+ return (queue->size == 0);
+}
+
+/* Fetch a namespace queue for the given ns_info struct. If the hash was not
+ * found or the clock is not enabled, then return NULL which will be substituted
+ * by the unknown queue. */
+iot_ns_queue_t *
+__iot_get_ns_queue (iot_conf_t *conf, ns_info_t *info)
+{
+ char ns_key[DICT_UINT32_KEY_SIZE];
+ int32_t ret = -1;
+ iot_ns_queue_t *queue = NULL;
+
+ if (!conf->ns_weighted_queueing || !info->found) {
+ gf_log (GF_IO_THREADS, GF_LOG_TRACE,
+ "Namespace not found for %u", info->hash);
+ return NULL;
+ }
+
+ dict_uint32_to_key (info->hash, ns_key);
+ ret = dict_get_ptr (conf->ns_queues, ns_key, (void **)&queue);
+
+ if (ret) {
+ gf_log (GF_IO_THREADS, GF_LOG_TRACE,
+ "Namespace not found for %u", info->hash);
+ }
+
+ /* If successful, return `queue`. */
+ return (!ret && queue) ? queue : NULL;
+}
+
+/* When we parse a new namespace conf file, we create a whole new set of queue
+ * structs; however, old requests may be sitting on old queues. This function
+ * drains the old requests lists into the new queue, or alternatively appends
+ * it onto the unknown queue if there is no corresponding new queue.
+ * (i.e. if it was removed from the conf file) */
+int
+__iot_drain_ns_queue_foreach (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+ iot_conf_t *conf = data;
+ dict_t *new_dict = conf->ns_queues;
+ iot_ns_queue_t *old_queue = data_to_ptr (value);
+ iot_ns_queue_t *new_queue = NULL;
+ int i;
+
+ ret = dict_get_ptr (new_dict, key, (void **)&new_queue);
+
+ /* Don't drain the unknown queue. */
+ if (old_queue == conf->ns_unknown_queue) {
+ return 0;
+ }
+
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ /* If we didn't find a "new queue" corresponding to the old,
+ * then drain into the unknown queue of that priority level. */
+ if (!ret && new_queue) {
+ list_append_init (&old_queue[i].reqs,
+ &new_queue[i].reqs);
+ new_queue[i].size += old_queue[i].size;
+ } else {
+ list_append_init (&old_queue[i].reqs,
+ &conf->ns_unknown_queue[i].reqs);
+ conf->ns_unknown_queue[i].size += old_queue[i].size;
+ }
+ }
+
+ return 0;
+}
+
+/* Drain the namespace queues in old_dict, if there are any. Then free the dict
+ * and clear the clock structs. */
+void
+__iot_drain_and_clear_clock (iot_conf_t *conf, dict_t *old_dict)
+{
+ int i;
+
+ if (old_dict) {
+ dict_foreach (old_dict, __iot_drain_ns_queue_foreach, conf);
+ dict_destroy (old_dict);
+ }
+
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ GF_FREE (conf->ns_clocks[i].slots);
+ conf->ns_clocks[i].slots = NULL;
+ conf->ns_clocks[i].idx = 0;
+ conf->ns_clocks[i].size = 0;
+ }
+}
+
+/* Parse a single namespace conf line. This constructs a new queue and puts it
+ * into the namespace dictionary as well, skipping duplicated namespaces with
+ * a warning. */
+int32_t
+__iot_ns_parse_conf_line (iot_conf_t *conf, char *file_line, uint32_t *total_weight)
+{
+ char ns_key[DICT_UINT32_KEY_SIZE];
+ char *ns_name = NULL;
+ iot_ns_queue_t *queue = NULL;
+ int32_t queue_weight = 1;
+ uint32_t ns_hash = 0;
+ int i, ret = -1, scanned = -1;
+
+ ns_name = GF_CALLOC (strlen (file_line), sizeof (char), 0);
+ if (!ns_name) {
+ gf_log (GF_IO_THREADS, GF_LOG_WARNING,
+ "Memory allocation error!");
+ ret = ENOMEM;
+ goto out;
+ }
+
+ /* Scan the line, skipping the second column which corresponds to a
+ * throttling rate, which we don't particularly care about. */
+ scanned = sscanf (file_line, "%s %*d %d", ns_name, &queue_weight);
+ if (scanned < 1 || strlen (ns_name) < 1) {
+ gf_log (GF_IO_THREADS, GF_LOG_WARNING,
+ "Empty or malformatted line \"%s\" while parsing", file_line);
+ goto out;
+ }
+
+ /* Hash the namespace name, convert it to a key, then search the dict
+ * for an entry matching this key. */
+ ns_hash = SuperFastHash (ns_name, strlen (ns_name));
+
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "Parsed namespace \'%s\' (%u)", ns_name, ns_hash);
+
+ dict_uint32_to_key (ns_hash, ns_key);
+ ret = dict_get_ptr (conf->ns_queues, ns_key, (void **)&queue);
+ if (!ret && queue) {
+ gf_log (GF_IO_THREADS, GF_LOG_WARNING,
+ "Duplicate-hashed queue found for namespace %s", ns_name);
+ /* Since ret == 0, we won't free the queue inadvertently. */
+ goto out;
+ }
+
+ queue = GF_CALLOC (IOT_PRI_MAX, sizeof (iot_ns_queue_t), 0);
+ if (!queue) {
+ gf_log (GF_IO_THREADS, GF_LOG_WARNING,
+ "Memory allocation error!");
+ ret = -(ENOMEM);
+ goto out;
+ }
+
+ /* Init queues. */
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ INIT_LIST_HEAD (&queue[i].reqs);
+ queue[i].hash = ns_hash;
+ queue[i].weight = conf->ns_default_weight;
+ queue[i].size = 0;
+ }
+
+ ret = dict_set_ptr (conf->ns_queues, ns_key, queue);
+ if (ret) {
+ goto out;
+ }
+
+ ret = dict_set_dynstr_with_alloc (conf->hash_to_ns, ns_key, ns_name);
+ if (ret) {
+ goto out;
+ }
+
+ if (scanned < 2 || queue_weight < 1) {
+ gf_log (GF_IO_THREADS, GF_LOG_WARNING,
+ "No weight (or too low) found in config line for namespace %s, "
+ "defaulting to weight of %u.", ns_name, conf->ns_default_weight);
+ } else {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "Parsed weight \'%s\' = %u", ns_name, queue_weight);
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ queue[i].weight = (uint32_t) queue_weight;
+ }
+ }
+
+ *total_weight += queue->weight;
+
+out:
+ if (ns_name) {
+ GF_FREE (ns_name);
+ }
+
+ if (ret && queue) {
+ GF_FREE (queue);
+ }
+
+ return ret;
+}
+
+/* This function (re)initializes the clock that is used by the en/de-queue
+ * operations. */
+void
+__iot_reinit_ns_conf (iot_conf_t *conf)
+{
+ char ns_unknown_key[DICT_UINT32_KEY_SIZE];
+ dict_t *old_dict, *new_dict;
+ FILE *fp = NULL;
+ char *line = NULL;
+ size_t len = 0;
+ uint32_t total_weight;
+ int i, ret = 0;
+
+ if (!conf) {
+ return;
+ }
+
+ if (conf->ns_weighted_queueing) {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "Loading %s from disk.",
+ _IOT_NAMESPACE_CONF);
+
+ fp = fopen (_IOT_NAMESPACE_CONF, "r");
+ if (!fp) {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "Cannot open file for reading.");
+ ret = ENOENT;
+ goto out;
+ }
+
+ /* Save the old queues; we will need to drain old requests out
+ * of it once we make new queues. */
+ old_dict = conf->ns_queues;
+ conf->ns_queues = new_dict = get_new_dict ();
+
+ /* Include the unknown queue weight, which isn't a parsed line. */
+ total_weight = conf->ns_default_weight;
+
+ /* Parse the new config file line-by-line, making a new queue
+ * for each namespace that is parsed. */
+ while (getline (&line, &len, fp) != -1) {
+ __iot_ns_parse_conf_line (conf, line, &total_weight);
+ }
+ free (line);
+
+ /* Drain old queues into new queues, or into unknown queue. */
+ __iot_drain_and_clear_clock (conf, old_dict);
+
+ /* We will add the unknown queue manually into the dictionaries. */
+ dict_uint32_to_key (0, ns_unknown_key);
+ ret = dict_set_dynstr_with_alloc (conf->hash_to_ns,
+ ns_unknown_key, "unknown");
+ if (ret) {
+ goto out;
+ }
+
+                /* Set the ns_unknown_queue as a static pointer so it's not freed
+                 * in the queue drain step the next time the conf is reloaded. */
+ ret = dict_set_static_ptr (conf->ns_queues, ns_unknown_key,
+ conf->ns_unknown_queue);
+ if (ret) {
+ goto out;
+ }
+
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ ret = __iot_reinit_clock (conf, i, total_weight);
+ if (ret) {
+ goto out;
+ }
+ }
+
+ /* Finally, keep our conf struct updated so we don't spuriously
+ * reconfigure the clock. */
+ get_file_mtime (_IOT_NAMESPACE_CONF, &conf->ns_conf_mtime);
+ }
+
+ ret = 0;
+out:
+ if (fp) {
+ fclose (fp);
+ }
+
+ if (ret) {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "There was an error loading the namespaces conf file, "
+ "disabling clock.");
+ conf->ns_weighted_queueing = _gf_false;
+ }
+
+ /* If our clock isn't enabled (or it was disabled because of an error)
+ * then drain the queues if there are any. */
+ if (!conf->ns_weighted_queueing) {
+ old_dict = conf->ns_queues;
+ /* This NULL signals to the drain that we're not populating any
+ * new queues... */
+ conf->ns_queues = NULL;
+ __iot_drain_and_clear_clock (conf, old_dict);
+ }
+}
+
+/* This is a simple iterative algorithm which tries to allocate the lowest
+ * amount of slots that maintains the same proportional amount of work given
+ * to each namespace.
+ *
+ * Each namespace has a weight, and the proportion of "weight / total weight"
+ * is something we'll call the "ideal" work ratio. If we have no latency and
+ * infinite queues, this ratio is the amount of requests we serve (out of total)
+ * for the given namespace.
+ *
+ * For a given namespace idx i, let the namespace's weight be W_i, and the total
+ * weight be TW. The "ideal" percentage is thus given by W_i/TW. Now if we
+ * choose some arbitrary total number of slots TS, the number of slots we
+ * should give to the namespace is given by S_i = TS*(W_i/TW). Since this is
+ * probably not an integer, we'll actually floor the number, meaning that the
+ * sum of S_i for each namespace most likely doesn't equal the TS we chose
+ * earlier. Let this "real" total sum of slots be RS.
+ *
+ * Now we have the concept of an "realized" percentage given by the ratio of
+ * _allocated slots_ instead of just ideal weights, given by S_i/RS. We consider
+ * this set of slot allocations to be ideal if these two ratios (slots and
+ * weights) are "close enough", given by our ns-weight-tolerance option.
+ *
+ * We then use a loop to distribute the shares, using some modulo magic to
+ * get a good, semi-even distribution in the slots array. The main concern here
+ * is trying to make sure that no single share is bunched up.
+ * If we have namespaces A and B with 2 and 8 slots respectively, we should
+ * shoot for a distribution like [A B B B B A B B B] instead of like
+ * [A A B B B B B B B B]. This loop tries its best to do that. We don't want
+ * to shuffle randomly either, since there is still a risk of having a bad
+ * bunching if our RNG is bad or we're unlucky... */
+int32_t
+__iot_reinit_clock (iot_conf_t *conf, int i, uint32_t total_weight)
+{
+ int32_t ret = -1;
+ uint32_t try_slots, total_slots, slots_idx, rep;
+ data_pair_t *curr = NULL;
+ iot_ns_queue_t *curr_queue = NULL;
+ iot_ns_queue_t **slots = NULL;
+ char *ns_name = NULL;
+ gf_boolean_t fail;
+ double real_percentage;
+
+ /* Initialize the "ideal" percentage each queue should have. */
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+
+ curr_queue->percentage = ((double) curr_queue->weight)
+ / total_weight;
+ }
+
+ /* We try to satisfy each percentage within some margin, first trying
+ * 1 slot, until we get up to the total sum of all weights, which will
+ * obviously satisfy each percentage but in many cases is far too large
+ * for a slots matrix. */
+ for (try_slots = 1; try_slots <= total_weight; try_slots++) {
+ fail = _gf_false;
+ total_slots = 0;
+
+                /* Calculate how many slots each namespace must get. Since we're
+ * rounding integers, we keep track of the actual total number
+ * of slots in `total_slots`. */
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+
+ curr_queue->slots = (int) (curr_queue->percentage * try_slots);
+ total_slots += curr_queue->slots;
+
+ /* If we've allocated less than 1 slot for the queue,
+ * we should find a larger size. */
+ if (curr_queue->slots < 1) {
+ fail = _gf_true;
+ break;
+ }
+ }
+
+ if (fail) {
+ continue;
+ }
+
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+
+ real_percentage = ((double) curr_queue->slots) / total_slots;
+ /* If the realized percentage is more than ns_weight_tolerance
+ * percent away from the ideal percentage, then let's try
+ * another number. */
+ if (abs (real_percentage - curr_queue->percentage) * 100.0
+ > conf->ns_weight_tolerance) {
+ fail = _gf_true;
+ break;
+ }
+ }
+
+ if (!fail) {
+ break;
+ }
+ }
+
+ /* Report the fits that we have found. */
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+
+ real_percentage = ((double) curr_queue->slots) / total_slots;
+ ret = dict_get_str (conf->hash_to_ns, curr->key, &ns_name);
+
+ if (ret || !ns_name) {
+ continue;
+ }
+
+ gf_log (GF_IO_THREADS, GF_LOG_INFO,
+ "Initializing namespace \'%s\' (weight: %d) with %d slots. "
+ "Ideal percentage: %0.2f%%, real percentage: %0.2f%%.",
+ ns_name, curr_queue->weight, curr_queue->slots,
+ curr_queue->percentage * 100.0, real_percentage * 100.0);
+ }
+
+ /* At this point, we've either found a good fit, or gotten all the way to
+ * total_weight. In either case, we can start allocating slots. */
+ slots = GF_CALLOC (total_slots, sizeof (iot_ns_queue_t *), 0);
+ slots_idx = 0;
+ rep = 0;
+
+ if (!slots) {
+ ret = -(ENOMEM);
+ goto out;
+ }
+
+ /* Allocate slots, with some fun modulo-math to make sure that they're
+ * well distributed. */
+ while (total_slots != slots_idx) {
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+
+ if (curr_queue->slots == 0) {
+ continue;
+ }
+
+ if (rep % (total_slots / curr_queue->slots) == 0) {
+ slots[slots_idx++] = curr_queue;
+ curr_queue->slots--;
+ }
+ }
+
+ rep++;
+ }
+
+ /* Set the slots into the queue, and we're ready to go! */
+ conf->ns_clocks[i].slots = slots;
+ conf->ns_clocks[i].size = total_slots;
+ conf->ns_clocks[i].idx = 0;
+
+ ret = 0;
+out:
+ if (ret && slots) {
+ GF_FREE (slots);
+ }
+
+ return ret;
+}
+
+
+void *
+iot_reinit_ns_conf_thread (void *arg)
+{
+ xlator_t *this = arg;
+ iot_conf_t *conf = this->private;
+ time_t curr_mtime = {0, };
+
+ while (_gf_true) {
+ sleep (conf->ns_conf_reinit_secs);
+
+ get_file_mtime (_IOT_NAMESPACE_CONF, &curr_mtime);
+ if (conf->ns_weighted_queueing && curr_mtime != conf->ns_conf_mtime) {
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ pthread_mutex_lock (&conf->mutex);
+ {
+ __iot_reinit_ns_conf (conf);
+ }
+ pthread_mutex_unlock (&conf->mutex);
+ pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+ } else {
+ gf_log (GF_IO_THREADS, GF_LOG_DEBUG,
+ "Config file %s not modified, skipping.",
+ _IOT_NAMESPACE_CONF);
+ }
+ }
+}
+
+static void
+start_iot_reinit_ns_conf_thread (xlator_t *this)
+{
+ iot_conf_t *priv = this->private;
+ int ret;
+
+ if (!priv) {
+ return;
+ }
+
+ if (priv->reinit_ns_conf_thread_running) {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO, "reinit_ns_conf_thread already started.");
+ return;
+ }
+
+ gf_log (GF_IO_THREADS, GF_LOG_INFO, "Starting reinit_ns_conf_thread.");
+
+ ret = pthread_create (&priv->reinit_ns_conf_thread, NULL, iot_reinit_ns_conf_thread, this);
+ if (ret == 0) {
+ priv->reinit_ns_conf_thread_running = _gf_true;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pthread_create(iot_reinit_ns_conf_thread) failed");
+ }
+}
+
+static void
+stop_iot_reinit_ns_conf_thread (xlator_t *this)
+{
+ iot_conf_t *priv = this->private;
+
+ if (!priv) {
+ return;
+ }
+
+ if (!priv->reinit_ns_conf_thread_running) {
+ gf_log (GF_IO_THREADS, GF_LOG_INFO, "reinit_ns_conf_thread already stopped.");
+ return;
+ }
+
+ gf_log (GF_IO_THREADS, GF_LOG_INFO, "Stopping reinit_ns_conf_thread.");
+
+ if (pthread_cancel (priv->reinit_ns_conf_thread) != 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pthread_cancel(iot_reinit_ns_conf_thread) failed");
+ }
+
+ if (pthread_join (priv->reinit_ns_conf_thread, NULL) != 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pthread_join(iot_reinit_ns_conf_thread) failed");
+ }
+
+ /* Failure probably means it's already dead. */
+ priv->reinit_ns_conf_thread_running = _gf_false;
+}
+
call_stub_t *
-__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep)
+__iot_dequeue (iot_conf_t *conf, int *pri)
{
- call_stub_t *stub = NULL;
- int i = 0;
- struct timeval curtv = {0,}, difftv = {0,};
+ call_stub_t *stub = NULL;
+ iot_ns_clock_t *curr_clock;
+ iot_ns_queue_t *queue = NULL;
+ int i, start_idx;
*pri = -1;
- sleep->tv_sec = 0;
- sleep->tv_nsec = 0;
+
for (i = 0; i < IOT_PRI_MAX; i++) {
- if (list_empty (&conf->reqs[i]) ||
- (conf->ac_iot_count[i] >= conf->ac_iot_limit[i]))
+ if (conf->ac_iot_count[i] >= conf->ac_iot_limit[i] ||
+ conf->queue_sizes[i] == 0) {
+ /* If we have too many workers currently serving this
+ * priority level, or no reqs at this level, skip. */
continue;
+ }
- if (i == IOT_PRI_LEAST) {
- pthread_mutex_lock(&conf->throttle.lock);
- if (!conf->throttle.sample_time.tv_sec) {
- /* initialize */
- gettimeofday(&conf->throttle.sample_time, NULL);
- } else {
- /*
- * Maintain a running count of least priority
- * operations that are handled over a particular
- * time interval. The count is provided via
- * state dump and is used as a measure against
- * least priority op throttling.
- */
- gettimeofday(&curtv, NULL);
- timersub(&curtv, &conf->throttle.sample_time,
- &difftv);
- if (difftv.tv_sec >= IOT_LEAST_THROTTLE_DELAY) {
- conf->throttle.cached_rate =
- conf->throttle.sample_cnt;
- conf->throttle.sample_cnt = 0;
- conf->throttle.sample_time = curtv;
- }
-
- /*
- * If we're over the configured rate limit,
- * provide an absolute time to the caller that
- * represents the soonest we're allowed to
- * return another least priority request.
- */
- if (conf->throttle.rate_limit &&
- conf->throttle.sample_cnt >=
- conf->throttle.rate_limit) {
- struct timeval delay;
- delay.tv_sec = IOT_LEAST_THROTTLE_DELAY;
- delay.tv_usec = 0;
-
- timeradd(&conf->throttle.sample_time,
- &delay, &curtv);
- TIMEVAL_TO_TIMESPEC(&curtv, sleep);
-
- pthread_mutex_unlock(
- &conf->throttle.lock);
- break;
- }
- }
- conf->throttle.sample_cnt++;
- pthread_mutex_unlock(&conf->throttle.lock);
- }
-
- stub = list_entry (conf->reqs[i].next, call_stub_t, list);
+ if (conf->ns_weighted_queueing) {
+ /* Get the clock for this priority level, and keep track
+ * of where the search started, so we know if we've
+ * searched through the whole clock. */
+ curr_clock = &conf->ns_clocks[i];
+ start_idx = curr_clock->idx;
+
+ do {
+ /* Get the queue for the current index (modulo
+ * size), and increment the clock forward. */
+ queue = curr_clock->slots[curr_clock->idx];
+ curr_clock->idx = (curr_clock->idx + 1) % curr_clock->size;
+
+ /* If we have a request to serve, then we're done. */
+ if (!__iot_ns_queue_empty (queue)) {
+ break;
+ }
+
+ /* Otherwise, keep searching until we've looped
+ * back to the start. */
+ queue = NULL;
+ } while (curr_clock->idx != start_idx);
+
+ /* If we have no queue, we must've not found a req to
+ * serve. Let's try the next priority. */
+ if (!queue) {
+ continue;
+ }
+ } else {
+ /* If our unknown queue is empty, we have no other
+ * queues to serve. */
+ if (__iot_ns_queue_empty (&conf->ns_unknown_queue[i])) {
+ continue;
+ }
+
+ /* Select the unknown queue as the next queue to
+ * serve a request from. */
+ queue = &conf->ns_unknown_queue[i];
+ }
+
+ /* Otherwise take a request off of the queue. */
+ stub = list_first_entry (&queue->reqs, call_stub_t, list);
+ list_del_init (&stub->list);
+
+ /* Increment the number of workers serving this priority,
+ * and record which priority we are serving. Update queue
+ * sizes and set `pri` variable for the caller. */
conf->ac_iot_count[i]++;
conf->queue_marked[i] = _gf_false;
+
+ conf->queue_size--;
+ conf->queue_sizes[i]--;
+ queue->size--;
+
*pri = i;
break;
}
- if (!stub)
- return NULL;
-
- conf->queue_size--;
- conf->queue_sizes[*pri]--;
- list_del_init (&stub->list);
-
return stub;
}
@@ -135,15 +680,29 @@ __iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep)
void
__iot_enqueue (iot_conf_t *conf, call_stub_t *stub, int pri)
{
- if (pri < 0 || pri >= IOT_PRI_MAX)
- pri = IOT_PRI_MAX-1;
+ ns_info_t *info = &stub->frame->root->ns_info;
+ iot_ns_queue_t *queue = 0;
+
+ /* If we have an invalid priority, set it to LEAST. */
+ if (pri < 0 || pri >= IOT_PRI_MAX) {
+ pri = IOT_PRI_MAX - 1;
+ }
- list_add_tail (&stub->list, &conf->reqs[pri]);
+ /* Get the queue for this namespace. If we don't have one,
+ * use the unknown queue that always exists in the conf struct. */
+ queue = __iot_get_ns_queue (conf, info);
+ if (!queue) {
+ queue = conf->ns_unknown_queue;
+ }
+
+ /* Get the (pri)'th level queue, and add the request to the queue. */
+ queue = &queue[pri];
+ list_add_tail (&stub->list, &queue->reqs);
+ /* Update size records. */
conf->queue_size++;
conf->queue_sizes[pri]++;
-
- return;
+ queue->size++;
}
@@ -156,28 +715,24 @@ iot_worker (void *data)
struct timespec sleep_till = {0, };
int ret = 0;
int pri = -1;
- struct timespec sleep = {0,};
- gf_boolean_t bye = _gf_false;
+ char timeout = 0;
+ char bye = 0;
conf = data;
this = conf->this;
THIS = this;
+ gf_log (GF_IO_THREADS, GF_LOG_DEBUG, "IOT worker spawned.");
- for (;;) {
+ while (_gf_true) {
pthread_mutex_lock (&conf->mutex);
{
if (pri != -1) {
conf->ac_iot_count[pri]--;
pri = -1;
}
- while (conf->queue_size == 0) {
- if (conf->down) {
- bye = _gf_true;/*Avoid sleep*/
- break;
- }
- clock_gettime (CLOCK_REALTIME_COARSE,
- &sleep_till);
+ while (conf->queue_size == 0) {
+ clock_gettime (CLOCK_REALTIME_COARSE, &sleep_till);
sleep_till.tv_sec += conf->idle_time;
conf->sleep_count++;
@@ -186,55 +741,52 @@ iot_worker (void *data)
&sleep_till);
conf->sleep_count--;
- if (conf->down || ret == ETIMEDOUT) {
- bye = _gf_true;
+ if (ret == ETIMEDOUT) {
+ timeout = 1;
break;
}
}
- if (bye) {
- if (conf->down ||
- conf->curr_count > IOT_MIN_THREADS) {
+ if (timeout) {
+ if (conf->curr_count > IOT_MIN_THREADS) {
conf->curr_count--;
- if (conf->curr_count == 0)
- pthread_cond_broadcast (&conf->cond);
- gf_msg_debug (conf->this->name, 0,
- "terminated. "
- "conf->curr_count=%d",
- conf->curr_count);
+ bye = 1;
+ gf_log (conf->this->name, GF_LOG_DEBUG,
+ "timeout, terminated. conf->curr_count=%d",
+ conf->curr_count);
} else {
- bye = _gf_false;
+ timeout = 0;
}
}
- if (!bye) {
- stub = __iot_dequeue (conf, &pri, &sleep);
- if (!stub && (sleep.tv_sec || sleep.tv_nsec)) {
- pthread_cond_timedwait(&conf->cond,
- &conf->mutex,
- &sleep);
- pthread_mutex_unlock(&conf->mutex);
- continue;
- }
- }
+ stub = __iot_dequeue (conf, &pri);
}
pthread_mutex_unlock (&conf->mutex);
if (stub) { /* guard against spurious wakeups */
if (stub->poison) {
gf_log (this->name, GF_LOG_INFO,
- "Dropping poisoned request %p.", stub);
+ "dropping poisoned request %p", stub);
call_stub_destroy (stub);
} else {
call_resume (stub);
}
}
- stub = NULL;
- if (bye)
+ if (bye) {
break;
+ }
+ }
+
+ if (pri != -1) {
+ pthread_mutex_lock (&conf->mutex);
+ {
+ conf->ac_iot_count[pri]--;
+ }
+ pthread_mutex_unlock (&conf->mutex);
}
+ gf_log (GF_IO_THREADS, GF_LOG_DEBUG, "IOT worker died.");
return NULL;
}
@@ -273,6 +825,9 @@ iot_get_pri_meaning (iot_pri_t pri)
{
char *name = NULL;
switch (pri) {
+ case IOT_PRI_UNSPEC:
+ name = "unspecified";
+ break;
case IOT_PRI_HI:
name = "fast";
break;
@@ -288,13 +843,11 @@ iot_get_pri_meaning (iot_pri_t pri)
case IOT_PRI_MAX:
name = "invalid";
break;
- case IOT_PRI_UNSPEC:
- name = "unspecified";
- break;
}
return name;
}
+
int
iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
{
@@ -307,75 +860,17 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
goto out;
}
- switch (stub->fop) {
- case GF_FOP_OPEN:
- case GF_FOP_STAT:
- case GF_FOP_FSTAT:
- case GF_FOP_LOOKUP:
- case GF_FOP_ACCESS:
- case GF_FOP_READLINK:
- case GF_FOP_OPENDIR:
- case GF_FOP_STATFS:
- case GF_FOP_READDIR:
- case GF_FOP_READDIRP:
- case GF_FOP_GETACTIVELK:
- case GF_FOP_SETACTIVELK:
- pri = IOT_PRI_HI;
- break;
-
- case GF_FOP_CREATE:
- case GF_FOP_FLUSH:
- case GF_FOP_LK:
- case GF_FOP_INODELK:
- case GF_FOP_FINODELK:
- case GF_FOP_ENTRYLK:
- case GF_FOP_FENTRYLK:
- case GF_FOP_LEASE:
- case GF_FOP_UNLINK:
- case GF_FOP_SETATTR:
- case GF_FOP_FSETATTR:
- case GF_FOP_MKNOD:
- case GF_FOP_MKDIR:
- case GF_FOP_RMDIR:
- case GF_FOP_SYMLINK:
- case GF_FOP_RENAME:
- case GF_FOP_LINK:
- case GF_FOP_SETXATTR:
- case GF_FOP_GETXATTR:
- case GF_FOP_FGETXATTR:
- case GF_FOP_FSETXATTR:
- case GF_FOP_REMOVEXATTR:
- case GF_FOP_FREMOVEXATTR:
- pri = IOT_PRI_NORMAL;
- break;
+ if (frame->pri != IOT_PRI_UNSPEC) {
+ pri = frame->pri;
+ goto out;
+ }
- case GF_FOP_READ:
- case GF_FOP_WRITE:
- case GF_FOP_FSYNC:
- case GF_FOP_TRUNCATE:
- case GF_FOP_FTRUNCATE:
- case GF_FOP_FSYNCDIR:
- case GF_FOP_XATTROP:
- case GF_FOP_FXATTROP:
- case GF_FOP_RCHECKSUM:
- case GF_FOP_FALLOCATE:
- case GF_FOP_DISCARD:
- case GF_FOP_ZEROFILL:
- pri = IOT_PRI_LO;
- break;
+ // Retrieve the fop priority
+ pri = iot_fop_to_pri (stub->fop);
- case GF_FOP_FORGET:
- case GF_FOP_RELEASE:
- case GF_FOP_RELEASEDIR:
- case GF_FOP_GETSPEC:
- break;
- case GF_FOP_IPC:
- default:
- return -EINVAL;
- }
out:
- gf_msg_debug (this->name, 0, "%s scheduled as %s fop",
- gf_fop_list[stub->fop], iot_get_pri_meaning (pri));
+ gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop",
+ gf_fop_list[stub->fop], iot_get_pri_meaning (pri));
ret = do_iot_schedule (this->private, stub, pri);
return ret;
}
@@ -619,34 +1114,115 @@ iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
}
+/* Populate all queue size keys for a specific priority level. */
int
-iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
+__iot_populate_ns_queue_sizes (iot_conf_t *conf, dict_t *depths, int pri)
+{
+ int ret = 0;
+ data_pair_t *curr = NULL;
+ iot_ns_queue_t *curr_queue = NULL;
+ char *temp_key = NULL;
+ char *ns_name = NULL;
+ const char *pri_str = fop_pri_to_string (pri);;
+
+        /* For each namespace at this priority level, we try to grab the
+         * namespace's name and record a key like `namespaces.NAME.PRI_LEVEL`. */
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[pri];
+
+ /* Try to retrieve the namespace's un-hashed real name. */
+ ret = dict_get_str (conf->hash_to_ns, curr->key, &ns_name);
+ if (ret || !ns_name) {
+ continue;
+ }
+
+ /* Give the root namespace a readable name. */
+ if (strncmp (ns_name, "/", 1) == 0) {
+ ns_name = "root";
+ }
+
+ /* Print a new temporary key for the namespace and priority level. */
+ ret = gf_asprintf (&temp_key, "namespaces.%s.%s", ns_name, pri_str);
+ if (ret == -1 || !temp_key) {
+ ret = -(ENOMEM);
+ goto out;
+ }
+
+ /* Insert the key and queue-size. */
+ ret = dict_set_int32 (depths, temp_key, curr_queue->size);
+ GF_FREE (temp_key);
+ temp_key = NULL;
+
+ if (ret) {
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+/* Populate global queue counts (per-priority) and if namespace queueing is
+ * enabled, then also add those keys to the dictionary as well. */
+int
+__iot_populate_queue_sizes (iot_conf_t *conf, dict_t **depths)
{
- iot_conf_t *conf = NULL;
- dict_t *depths = NULL;
- int i = 0;
+ int ret = 0, pri = 0;
+ const char *pri_str = NULL;
- conf = this->private;
+ /* We explicitly do not want a reference count for this dict
+ * in this translator, since it will get freed in io_stats later. */
+ *depths = get_new_dict ();
+ if (!*depths) {
+ return -(ENOMEM);
+ }
- if (conf && name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) {
- // We explicitly do not want a reference count
- // for this dict in this translator
- depths = get_new_dict ();
- if (!depths)
- goto unwind_special_getxattr;
+ for (pri = 0; pri < IOT_PRI_MAX; pri++) {
+ pri_str = fop_pri_to_string (pri);
- for (i = 0; i < IOT_PRI_MAX; i++) {
- if (dict_set_int32 (depths,
- (char *)fop_pri_to_string (i),
- conf->queue_sizes[i]) != 0) {
- dict_destroy (depths);
- depths = NULL;
- goto unwind_special_getxattr;
+ /* First, let's add an entry for the number of requests
+ * per prority (globally). */
+ ret = dict_set_int32 (*depths, (char *)pri_str,
+ conf->queue_sizes[pri]);
+ if (ret) {
+ goto out;
+ }
+
+ /* If namespace queueing is enabled, then try to populate
+ * per-namespace queue keys as well. */
+ if (conf->ns_weighted_queueing) {
+ ret = __iot_populate_ns_queue_sizes (conf, *depths, pri);
+ if (ret) {
+ goto out;
}
}
+ }
+
+out:
+ if (ret) {
+ dict_destroy (*depths);
+ *depths = NULL;
+ }
+
+ return ret;
+}
+
+int
+iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ iot_conf_t *conf = this->private;
+ dict_t *depths = NULL;
+
+ if (name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) {
+ /* Populate depths dict, or it'll stay NULL if an error occurred. */
+ pthread_mutex_lock (&conf->mutex);
+ {
+ __iot_populate_queue_sizes (conf, &depths);
+ }
+ pthread_mutex_unlock (&conf->mutex);
-unwind_special_getxattr:
STACK_UNWIND_STRICT (getxattr, frame, 0, 0, depths, xdata);
return 0;
}
@@ -824,10 +1400,9 @@ __iot_workers_scale (iot_conf_t *conf)
ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf);
if (ret == 0) {
conf->curr_count++;
- gf_msg_debug (conf->this->name, 0,
- "scaled threads to %d (queue_size=%d/%d)",
- conf->curr_count,
- conf->queue_size, scale);
+ gf_log (conf->this->name, GF_LOG_DEBUG,
+ "scaled threads to %d (queue_size=%d/%d)",
+ conf->curr_count, conf->queue_size, scale);
} else {
break;
}
@@ -872,13 +1447,11 @@ set_stack_size (iot_conf_t *conf)
if (err == EINVAL) {
err = pthread_attr_getstacksize (&conf->w_attr, &stacksize);
if (!err)
- gf_msg (this->name, GF_LOG_WARNING,
- 0, IO_THREADS_MSG_SIZE_NOT_SET,
+ gf_log (this->name, GF_LOG_WARNING,
"Using default thread stack size %zd",
stacksize);
else
- gf_msg (this->name, GF_LOG_WARNING,
- 0, IO_THREADS_MSG_SIZE_NOT_SET,
+ gf_log (this->name, GF_LOG_WARNING,
"Using default thread stack size");
}
@@ -897,9 +1470,8 @@ mem_acct_init (xlator_t *this)
ret = xlator_mem_acct_init (this, gf_iot_mt_end + 1);
if (ret != 0) {
- gf_msg (this->name, GF_LOG_ERROR,
- ENOMEM, IO_THREADS_MSG_NO_MEMORY,
- "Memory accounting init failed");
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init "
+ "failed");
return ret;
}
@@ -938,10 +1510,6 @@ iot_priv_dump (xlator_t *this)
gf_proc_dump_write("least_priority_threads", "%d",
conf->ac_iot_limit[IOT_PRI_LEAST]);
- gf_proc_dump_write("cached least rate", "%u",
- conf->throttle.cached_rate);
- gf_proc_dump_write("least rate limit", "%u", conf->throttle.rate_limit);
-
return 0;
}
@@ -1107,13 +1675,16 @@ stop_iot_watchdog (xlator_t *this)
int
reconfigure (xlator_t *this, dict_t *options)
{
- iot_conf_t *conf = NULL;
- int ret = -1;
+ iot_conf_t *conf = NULL;
+ int i, ret = -1;
+ gf_boolean_t ns_weighted_queueing = _gf_false;
conf = this->private;
if (!conf)
goto out;
+ GF_OPTION_RECONF ("iam-brick-daemon", conf->iambrickd, options, bool, out);
+
GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out);
GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
@@ -1132,25 +1703,52 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("least-prio-threads",
conf->ac_iot_limit[IOT_PRI_LEAST], options, int32,
out);
+
GF_OPTION_RECONF ("enable-least-priority", conf->least_priority,
options, bool, out);
- GF_OPTION_RECONF ("least-rate-limit", conf->throttle.rate_limit,
- options, int32, out);
-
GF_OPTION_RECONF ("cleanup-disconnected-reqs",
conf->cleanup_disconnected_reqs, options, bool, out);
GF_OPTION_RECONF ("watchdog-secs", conf->watchdog_secs, options,
int32, out);
-
if (conf->watchdog_secs > 0) {
start_iot_watchdog (this);
} else {
stop_iot_watchdog (this);
}
- ret = 0;
+ GF_OPTION_RECONF ("ns-weighted-queueing", ns_weighted_queueing, options, bool, out);
+ GF_OPTION_RECONF ("ns-default-weight", conf->ns_default_weight, options, uint32, out);
+ GF_OPTION_RECONF ("ns-weight-tolerance", conf->ns_weight_tolerance, options, double, out);
+ GF_OPTION_RECONF ("ns-conf-reinit-secs", conf->ns_conf_reinit_secs, options,
+ uint32, out);
+
+ if (!conf->iambrickd) {
+ ns_weighted_queueing = _gf_false;
+ }
+
+ /* Reinit the default weight, which is the weight of the unknown queue. */
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ conf->ns_unknown_queue[i].weight = conf->ns_default_weight;
+ }
+
+ if (ns_weighted_queueing != conf->ns_weighted_queueing) {
+ pthread_mutex_lock (&conf->mutex);
+ {
+ conf->ns_weighted_queueing = ns_weighted_queueing;
+ __iot_reinit_ns_conf (conf);
+ }
+ pthread_mutex_unlock (&conf->mutex);
+ }
+
+ if (conf->ns_weighted_queueing) {
+ start_iot_reinit_ns_conf_thread (this);
+ } else {
+ stop_iot_reinit_ns_conf_thread (this);
+ }
+
+ ret = 0;
out:
return ret;
}
@@ -1160,49 +1758,43 @@ int
init (xlator_t *this)
{
iot_conf_t *conf = NULL;
- int ret = -1;
- int i = 0;
+ int i, ret = -1;
if (!this->children || this->children->next) {
- gf_msg ("io-threads", GF_LOG_ERROR, 0,
- IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED,
- "FATAL: iot not configured "
- "with exactly one child");
+ gf_log ("io-threads", GF_LOG_ERROR,
+ "FATAL: iot not configured with exactly one child");
goto out;
}
if (!this->parents) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- IO_THREADS_MSG_VOL_MISCONFIGURED,
- "dangling volume. check volfile ");
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
}
conf = (void *) GF_CALLOC (1, sizeof (*conf),
gf_iot_mt_iot_conf_t);
if (conf == NULL) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- IO_THREADS_MSG_NO_MEMORY, "out of memory");
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory");
goto out;
}
- if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- IO_THREADS_MSG_INIT_FAILED,
+ if ((ret = pthread_cond_init (&conf->cond, NULL)) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
"pthread_cond_init failed (%d)", ret);
goto out;
}
- conf->cond_inited = _gf_true;
- if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- IO_THREADS_MSG_INIT_FAILED,
+ if ((ret = pthread_mutex_init (&conf->mutex, NULL)) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
"pthread_mutex_init failed (%d)", ret);
goto out;
}
- conf->mutex_inited = _gf_true;
set_stack_size (conf);
+ GF_OPTION_INIT ("iam-brick-daemon", conf->iambrickd, bool, out);
+
GF_OPTION_INIT ("thread-count", conf->max_count, int32, out);
GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
@@ -1228,33 +1820,45 @@ init (xlator_t *this)
GF_OPTION_INIT ("cleanup-disconnected-reqs",
conf->cleanup_disconnected_reqs, bool, out);
- GF_OPTION_INIT ("least-rate-limit", conf->throttle.rate_limit, int32,
- out);
+ conf->this = this;
+
+ GF_OPTION_INIT ("ns-weighted-queueing", conf->ns_weighted_queueing, bool, out);
+ GF_OPTION_INIT ("ns-default-weight", conf->ns_default_weight, uint32, out);
+ GF_OPTION_INIT ("ns-weight-tolerance", conf->ns_weight_tolerance, double, out);
+ GF_OPTION_INIT ("ns-conf-reinit-secs", conf->ns_conf_reinit_secs, uint32, out);
- if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- IO_THREADS_MSG_INIT_FAILED,
- "pthread_mutex_init failed (%d)", ret);
- goto out;
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ INIT_LIST_HEAD (&conf->ns_unknown_queue[i].reqs);
+ conf->ns_unknown_queue[i].hash = 0;
+ conf->ns_unknown_queue[i].weight = conf->ns_default_weight;
+ conf->ns_unknown_queue[i].size = 0;
}
- conf->this = this;
+ if (!conf->iambrickd) {
+ conf->ns_weighted_queueing = _gf_false;
+ }
- for (i = 0; i < IOT_PRI_MAX; i++) {
- INIT_LIST_HEAD (&conf->reqs[i]);
+ conf->hash_to_ns = dict_new ();
+
+ pthread_mutex_lock (&conf->mutex);
+ {
+ __iot_reinit_ns_conf (conf);
}
+ pthread_mutex_unlock (&conf->mutex);
ret = iot_workers_scale (conf);
-
if (ret == -1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- IO_THREADS_MSG_INIT_FAILED,
+ gf_log (this->name, GF_LOG_ERROR,
"cannot initialize worker threads, exiting init");
goto out;
}
this->private = conf;
+ if (conf->ns_weighted_queueing) {
+ start_iot_reinit_ns_conf_thread (this);
+ }
+
conf->watchdog_secs = 0;
GF_OPTION_INIT ("watchdog-secs", conf->watchdog_secs, int32, out);
if (conf->watchdog_secs > 0) {
@@ -1263,57 +1867,24 @@ init (xlator_t *this)
ret = 0;
out:
- if (ret)
+ if (ret) {
GF_FREE (conf);
+ }
return ret;
}
-static void
-iot_exit_threads (iot_conf_t *conf)
-{
- pthread_mutex_lock (&conf->mutex);
- {
- conf->down = _gf_true;
- /*Let all the threads know that xl is going down*/
- pthread_cond_broadcast (&conf->cond);
- while (conf->curr_count)/*Wait for threads to exit*/
- pthread_cond_wait (&conf->cond, &conf->mutex);
- }
- pthread_mutex_unlock (&conf->mutex);
-}
-
-int
-notify (xlator_t *this, int32_t event, void *data, ...)
-{
- iot_conf_t *conf = this->private;
-
- if (GF_EVENT_PARENT_DOWN == event)
- iot_exit_threads (conf);
-
- default_notify (this, event, data);
-
- return 0;
-}
void
fini (xlator_t *this)
{
iot_conf_t *conf = this->private;
- if (!conf)
- return;
-
- if (conf->mutex_inited && conf->cond_inited)
- iot_exit_threads (conf);
-
- if (conf->cond_inited)
- pthread_cond_destroy (&conf->cond);
-
- if (conf->mutex_inited)
- pthread_mutex_destroy (&conf->mutex);
-
stop_iot_watchdog (this);
+ stop_iot_reinit_ns_conf_thread (this);
+
+ dict_unref (conf->hash_to_ns);
+ conf->hash_to_ns = NULL;
GF_FREE (conf);
@@ -1321,13 +1892,32 @@ fini (xlator_t *this)
return;
}
+/* Clears a queue of requests from the given client (which is assumed to
+ * have disconnected or otherwise stopped needing these requests...) */
+void
+clear_reqs_from_queue (xlator_t *this, client_t *client, struct list_head *queue)
+{
+ call_stub_t *curr;
+ call_stub_t *next;
+
+ list_for_each_entry_safe (curr, next, queue, list) {
+ if (curr->frame->root->client != client) {
+ continue;
+ }
+ gf_log (this->name, GF_LOG_INFO,
+ "poisoning %s fop at %p for client %s",
+ gf_fop_list[curr->fop], curr, client->client_uid);
+ curr->poison = _gf_true;
+ }
+}
+
static int
iot_disconnect_cbk (xlator_t *this, client_t *client)
{
+ iot_conf_t *conf = this->private;
+ data_pair_t *curr = NULL;
+ iot_ns_queue_t *curr_queue = NULL;
int i;
- call_stub_t *curr;
- call_stub_t *next;
- iot_conf_t *conf = this->private;
if (!conf || !conf->cleanup_disconnected_reqs) {
goto out;
@@ -1335,15 +1925,15 @@ iot_disconnect_cbk (xlator_t *this, client_t *client)
pthread_mutex_lock (&conf->mutex);
for (i = 0; i < IOT_PRI_MAX; i++) {
- list_for_each_entry_safe (curr, next, &conf->reqs[i], list) {
- if (curr->frame->root->client != client) {
- continue;
+ /* Clear client reqs from the unknown queue. */
+ clear_reqs_from_queue (this, client, &conf->ns_unknown_queue[i].reqs);
+ /* Clear client reqs from each of the namespace queues. */
+ if (conf->ns_weighted_queueing && conf->ns_queues) {
+ dict_foreach_inline (conf->ns_queues, curr) {
+ curr_queue = data_to_ptr (curr->value);
+ curr_queue = &curr_queue[i];
+ clear_reqs_from_queue (this, client, &curr_queue->reqs);
}
- gf_log (this->name, GF_LOG_INFO,
- "poisoning %s fop at %p for client %s",
- gf_fop_list[curr->fop], curr,
- client->client_uid);
- curr->poison = _gf_true;
}
}
pthread_mutex_unlock (&conf->mutex);
@@ -1412,7 +2002,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
.max = IOT_MAX_THREADS,
- .default_value = "16",
+ .default_value = "32",
.description = "Number of threads in IO threads translator which "
"perform concurrent IO operations"
@@ -1435,7 +2025,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
.max = IOT_MAX_THREADS,
- .default_value = "16",
+ .default_value = "32",
.description = "Max number of threads in IO threads translator which "
"perform high priority IO operations at a given time"
@@ -1444,7 +2034,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
.max = IOT_MAX_THREADS,
- .default_value = "16",
+ .default_value = "32",
.description = "Max number of threads in IO threads translator which "
"perform normal priority IO operations at a given time"
@@ -1453,7 +2043,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
.max = IOT_MAX_THREADS,
- .default_value = "16",
+ .default_value = "32",
.description = "Max number of threads in IO threads translator which "
"perform low priority IO operations at a given time"
@@ -1477,14 +2067,6 @@ struct volume_options options[] = {
.max = 0x7fffffff,
.default_value = "120",
},
- { .key = {"least-rate-limit"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0,
- .max = INT_MAX,
- .default_value = "0",
- .description = "Max number of least priority operations to handle "
- "per-second"
- },
{ .key = {"watchdog-secs"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
@@ -1495,7 +2077,47 @@ struct volume_options options[] = {
{ .key = {"cleanup-disconnected-reqs"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "'Poison' queued requests when a client disconnects"
+ .description = "'Poison' queued requests when a client disconnects"
+ },
+ { .key = {"ns-weighted-queueing"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enables the namespace queues clock."
+ },
+ { .key = {"ns-default-weight"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 0x7fffffff,
+ .default_value = "100",
+ .description = "The default weight of a queue which doesn't have a "
+ "weight specified in the namespace conf. This is also "
+ "the weight of the unknown (default) queue."
+ },
+ { .key = {"ns-weight-tolerance"},
+ .type = GF_OPTION_TYPE_DOUBLE,
+ .default_value = "0.5",
+ .description = "The tolerance in percentage (out of 100) that the "
+ "slot-allocation algorithm will tolerate for weight/total "
+ "and slots/total percentages. This corresponds to "
+ "ideal and realized workload percentages allocated "
+ "to the namespace."
+ },
+ { .key = {"ns-conf-reinit-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 0x7fffffff,
+ .default_value = "5",
+ .description = "Number of seconds that the ns conf reinit thread "
+ "sleeps before trying to detect changes in the "
+ "namespace config file."
+ },
+ {
+ .key = {"iam-brick-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the io-threads "
+ "translator is running as part of a brick daemon "
+ "or not."
},
{ .key = {NULL},
},
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index 4300cf673b2..5c3403a7153 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -11,6 +11,11 @@
#ifndef __IOT_H
#define __IOT_H
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
#include "compat-errno.h"
#include "glusterfs.h"
@@ -25,6 +30,8 @@
#include <semaphore.h>
#include "statedump.h"
+#define GF_IO_THREADS "io-threads"
+const char *_IOT_NAMESPACE_CONF = TOSTRING (GLUSTERD_WORKDIR) "/namespaces.conf";
struct iot_conf;
@@ -38,23 +45,33 @@ struct iot_conf;
#define IOT_MIN_FOP_PER_THREAD 0
#define IOT_MAX_FOP_PER_THREAD 2000
-
-#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024))
-
-
-#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */
-struct iot_least_throttle {
- struct timeval sample_time; /* timestamp of current sample */
- uint32_t sample_cnt; /* sample count for active interval */
- uint32_t cached_rate; /* the most recently measured rate */
- int32_t rate_limit; /* user-specified rate limit */
- pthread_mutex_t lock;
-};
+/* A queue (well, typically a set of IOT_PRI_MAX queues) that corresponds to
+ * a namespace and a priority level. */
+typedef struct _iot_ns_queue {
+ uint32_t hash; /* Hash of namespace queue corresponds to. */
+ uint32_t weight; /* Weight this queue should have in the clock. */
+ double percentage; /* Percentage of total weight given to this queue. */
+ uint32_t slots; /* Temp variable for number of slots to allocate to this queue. */
+ uint32_t size; /* Number of reqs in the queue currently. */
+ struct list_head reqs; /* Queue of fop call-stubs (requests) */
+} iot_ns_queue_t;
+
+/* A circular buffer which points to multiple queues which approximate a
+ * weighted round robin serving requests. */
+typedef struct _iot_ns_clock {
+ unsigned int idx; /* Current idx of position on clock. */
+ size_t size; /* Size of clock. */
+ iot_ns_queue_t **slots; /* Circular buffer of clock slots. */
+} iot_ns_clock_t;
+
+#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024*8))
struct iot_conf {
pthread_mutex_t mutex;
pthread_cond_t cond;
+ gf_boolean_t iambrickd;
+
int32_t max_count; /* configured maximum */
int32_t fops_per_thread_ratio;
int32_t curr_count; /* actual number of threads running */
@@ -62,30 +79,39 @@ struct iot_conf {
int32_t idle_time; /* in seconds */
- struct list_head reqs[IOT_PRI_MAX];
+ gf_boolean_t ns_weighted_queueing;
+ uint32_t ns_default_weight;
+ double ns_weight_tolerance;
+ uint32_t ns_conf_reinit_secs;
+ time_t ns_conf_mtime;
+ dict_t *ns_queues; /* Queues for requests for namespaces that were parsed. */
+ dict_t *hash_to_ns; /* Hash key (string) -> Namespace string */
+ iot_ns_clock_t ns_clocks[IOT_PRI_MAX]; /* Weighted Round Robin clocks (per priority). */
+ iot_ns_queue_t ns_unknown_queue[IOT_PRI_MAX]; /* Queue for untagged requests (per priority). */
+
+ gf_boolean_t reinit_ns_conf_thread_running;
+ pthread_t reinit_ns_conf_thread;
int32_t ac_iot_limit[IOT_PRI_MAX];
int32_t ac_iot_count[IOT_PRI_MAX];
int queue_sizes[IOT_PRI_MAX];
int queue_size;
pthread_attr_t w_attr;
- gf_boolean_t least_priority; /*Enable/Disable least-priority */
+ gf_boolean_t least_priority; /* Enable/Disable least-priority */
- xlator_t *this;
+ xlator_t *this;
size_t stack_size;
- gf_boolean_t down; /*PARENT_DOWN event is notified*/
- gf_boolean_t mutex_inited;
- gf_boolean_t cond_inited;
- struct iot_least_throttle throttle;
-
int32_t watchdog_secs;
gf_boolean_t watchdog_running;
pthread_t watchdog_thread;
gf_boolean_t queue_marked[IOT_PRI_MAX];
+
gf_boolean_t cleanup_disconnected_reqs;
};
typedef struct iot_conf iot_conf_t;
+iot_pri_t iot_fop_to_pri (glusterfs_fop_t fop);
+
#endif /* __IOT_H */