summaryrefslogtreecommitdiffstats
path: root/xlators/debug/io-stats/src/io-stats.c
diff options
context:
space:
mode:
authorShreyas Siravara <sshreyas@fb.com>2017-09-03 09:18:10 -0700
committerShreyas Siravara <sshreyas@fb.com>2017-09-03 17:28:55 +0000
commit87f6e9f034fb6560161f419b5b3e22631f802ace (patch)
treeabdf9a61f21957833b411b4bc6607c88fc2ffddc /xlators/debug/io-stats/src/io-stats.c
parent51f797ce900527d8441269f3ebdc5456cfc299e0 (diff)
auditing: Sample creation & removal of filesystem entries as well as errors
Summary: - Adds the ability for gluster to log every single CREATE and UNLINK that happens on the bricks (right before invoking sys_unlink() or open(...| O_CREAT) - Makes it so that CREATEs and UNLINKs are not downsampled in io-stats - This is a port of D3268156, D3778968, D3903894 & D3301527 to 3.8 Reviewed By: kvigor Change-Id: I1bce28068c02b7d202f094094237646b4d39794b Reviewed-on: https://review.gluster.org/18198 Reviewed-by: Shreyas Siravara <sshreyas@fb.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/debug/io-stats/src/io-stats.c')
-rw-r--r--xlators/debug/io-stats/src/io-stats.c237
1 files changed, 235 insertions, 2 deletions
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index d25d5be29ea..101803f470d 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -42,6 +42,7 @@
#define MAX_LIST_MEMBERS 100
#define DEFAULT_PWD_BUF_SZ 16384
#define DEFAULT_GRP_BUF_SZ 16384
+#define IOS_MAX_ERRORS 132
typedef enum {
IOS_STATS_TYPE_NONE,
@@ -131,6 +132,154 @@ struct ios_global_stats {
struct timeval max_openfd_time;
};
+/* This is a list of errors which are in some way critical.
+ * It is useful to sample these errors even if other errors
+ * should be ignored. */
+const int32_t ios_hard_error_list[] = {
+ EIO,
+ EROFS,
+ ENOSPC,
+ ENOTCONN,
+ ESTALE,
+};
+
+#define IOS_HARD_ERROR_LIST_SIZE (sizeof(ios_hard_error_list) / sizeof(int32_t))
+
+const char *errno_to_name[IOS_MAX_ERRORS] = {
+ "success", /* 0 */
+ "eperm",
+ "enoent",
+ "esrch",
+ "eintr",
+ "eio",
+ "enxio",
+ "e2big",
+ "enoexec",
+ "ebadf",
+ "echild",
+ "eagain",
+ "enomem",
+ "eacces",
+ "efault",
+ "enotblk",
+ "ebusy",
+ "eexist",
+ "exdev",
+ "enodev",
+ "enotdir",
+ "eisdir",
+ "einval",
+ "enfile",
+ "emfile",
+ "enotty",
+ "etxtbsy",
+ "efbig",
+ "enospc",
+ "espipe",
+ "erofs",
+ "emlink",
+ "epipe",
+ "edom",
+ "erange",
+ "edeadlk",
+ "enametoolong",
+ "enolck",
+ "enosys",
+ "enotempty",
+ "eloop",
+ "ewouldblock",
+ "enomsg",
+ "eidrm",
+ "echrng",
+ "el2nsync",
+ "el3hlt",
+ "el3rst",
+ "elnrng",
+ "eunatch",
+ "enocsi",
+ "el2hlt",
+ "ebade",
+ "ebadr",
+ "exfull",
+ "enoano",
+ "ebadrqc",
+ "ebadslt",
+ "edeadlock",
+ "ebfont",
+ "enostr",
+ "enodata",
+ "etime",
+ "enosr",
+ "enonet",
+ "enopkg",
+ "eremote",
+ "enolink",
+ "eadv",
+ "esrmnt",
+ "ecomm",
+ "eproto",
+ "emultihop",
+ "edotdot",
+ "ebadmsg",
+ "eoverflow",
+ "enotuniq",
+ "ebadfd",
+ "eremchg",
+ "elibacc",
+ "elibbad",
+ "elibscn",
+ "elibmax",
+ "elibexec",
+ "eilseq",
+ "erestart",
+ "estrpipe",
+ "eusers",
+ "enotsock",
+ "edestaddrreq",
+ "emsgsize",
+ "eprototype",
+ "enoprotoopt",
+ "eprotonosupport",
+ "esocktnosupport",
+ "eopnotsupp",
+ "epfnosupport",
+ "eafnosupport",
+ "eaddrinuse",
+ "eaddrnotavail",
+ "enetdown",
+ "enetunreach",
+ "enetreset",
+ "econnaborted",
+ "econnreset",
+ "enobufs",
+ "eisconn",
+ "enotconn",
+ "eshutdown",
+ "etoomanyrefs",
+ "etimedout",
+ "econnrefused",
+ "ehostdown",
+ "ehostunreach",
+ "ealready",
+ "einprogress",
+ "estale",
+ "euclean",
+ "enotnam",
+ "enavail",
+ "eisnam",
+ "eremoteio",
+ "edquot",
+ "enomedium",
+ "emediumtype",
+ "ecanceled",
+ "enokey",
+ "ekeyexpired",
+ "ekeyrevoked",
+ "ekeyrejected",
+ "eownerdead",
+ "enotrecoverable"
+};
+
struct ios_conf {
gf_lock_t lock;
struct ios_global_stats cumulative;
@@ -153,6 +302,9 @@ struct ios_conf {
gf_boolean_t iamshd;
gf_boolean_t iamnfsd;
gf_boolean_t iamgfproxyd;
+ gf_boolean_t audit_creates_and_unlinks;
+ gf_boolean_t sample_hard_errors;
+ gf_boolean_t sample_all_errors;
};
@@ -1887,6 +2039,52 @@ out:
return;
}
+gf_boolean_t
+_should_sample (struct ios_conf *conf, glusterfs_fop_t fop_type,
+ ios_sample_buf_t* ios_sample_buf, int32_t op_ret,
+ int32_t op_errno)
+{
+ int i;
+
+ /* If sampling is disabled, return false */
+ if (conf->ios_sample_interval == 0)
+ return _gf_false;
+
+ /* Sometimes it's useful to sample errors. If `fop-sample-all-errors`
+ * is active, then we should sample ALL errors. */
+ if (op_ret < 0 && op_errno != 0 && conf->sample_all_errors) {
+ return _gf_true;
+ }
+
+ /* If `fop-sample-hard-errors` is active, we only look through a small
+ * subset of errno values to sample, those which are critical to Gluster
+ * functioning. */
+ if (op_ret < 0 && op_errno != 0 && conf->sample_hard_errors) {
+ for (i = 0; i < IOS_HARD_ERROR_LIST_SIZE; i++) {
+ if (abs (op_errno) == ios_hard_error_list[i]) {
+ return _gf_true;
+ }
+ }
+ }
+
+ /* If auditing is on, sample TRUNCATE, CREATE, UNLINK, RMDIR, MKDIR 1:1 */
+ if (conf->audit_creates_and_unlinks) {
+ switch (fop_type) {
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_CREATE:
+ case GF_FOP_UNLINK:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RMDIR:
+ return _gf_true;
+ default:
+ break;
+ }
+ }
+
+ /* Sample only 1 out of ios_sample_interval number of fops. */
+ return (ios_sample_buf->observed % conf->ios_sample_interval == 0);
+}
+
void collect_ios_latency_sample (struct ios_conf *conf,
glusterfs_fop_t fop_type, double elapsed,
call_frame_t *frame, int32_t op_ret, int32_t op_errno)
@@ -1901,9 +2099,9 @@ void collect_ios_latency_sample (struct ios_conf *conf,
ios_sample_buf = conf->ios_sample_buf;
LOCK (&conf->ios_sampling_lock);
- if (conf->ios_sample_interval == 0 ||
- ios_sample_buf->observed % conf->ios_sample_interval != 0)
+ if (!_should_sample (conf, fop_type, ios_sample_buf, op_ret, op_errno)) {
goto out;
+ }
timestamp = &frame->begin;
root = frame->root;
@@ -4022,6 +4220,15 @@ reconfigure (xlator_t *this, dict_t *options)
time, out);
gf_log_set_log_flush_timeout (log_flush_timeout);
+ GF_OPTION_RECONF ("fop-sample-enable-audit",
+ conf->audit_creates_and_unlinks, options, bool, out);
+
+ GF_OPTION_RECONF ("fop-sample-all-errors",
+ conf->sample_all_errors, options, bool, out);
+
+ GF_OPTION_RECONF ("fop-sample-hard-errors",
+ conf->sample_hard_errors, options, bool, out);
+
ret = 0;
out:
gf_log (this ? this->name : "io-stats",
@@ -4119,6 +4326,15 @@ init (xlator_t *this)
GF_OPTION_INIT ("iam-gfproxy-daemon", conf->iamgfproxyd, bool, out);
+ GF_OPTION_INIT ("fop-sample-hard-errors", conf->sample_hard_errors,
+ bool, out);
+
+ GF_OPTION_INIT ("fop-sample-all-errors", conf->sample_all_errors,
+ bool, out);
+
+ GF_OPTION_INIT ("fop-sample-enable-audit",
+ conf->audit_creates_and_unlinks, bool, out);
+
GF_OPTION_INIT ("dump-fd-stats", conf->dump_fd_stats, bool, out);
GF_OPTION_INIT ("count-fop-hits", conf->count_fop_hits, bool, out);
@@ -4559,6 +4775,23 @@ struct volume_options options[] = {
"translator is running as part of an GFProxy daemon "
"or not."
},
+ { .key = {"fop-sample-enable-audit"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option samples the following FOPs 1:1: "
+ "CREATE, UNLINK, MKDIR, RMDIR, TRUNCATE. "
+ },
+ { .key = {"fop-sample-all-errors"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option samples all fops that failed."
+ },
+ { .key = {"fop-sample-hard-errors"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option samples all fops with \"hard errors\""
+ "including EROFS, ENOSPC, etc."
+ },
{ .key = {NULL} },
};