diff options
122 files changed, 6687 insertions, 514 deletions
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c index 8c9872cfa53..5d08114c8c5 100644 --- a/api/src/glfs-mgmt.c +++ b/api/src/glfs-mgmt.c @@ -911,7 +911,8 @@ glfs_mgmt_init (struct glfs *fs) if (!strcmp (cmd_args->volfile_server_transport, "unix")) { ret = rpc_transport_unix_options_build (&options, host, 0); } else { - ret = rpc_transport_inet_options_build (&options, host, port); + ret = rpc_transport_inet_options_build (&options, host, port, + NULL); } if (ret) diff --git a/cli/src/cli.c b/cli/src/cli.c index 2ecaae415d6..fa507309e80 100644 --- a/cli/src/cli.c +++ b/cli/src/cli.c @@ -586,6 +586,11 @@ cli_rpc_init (struct cli_state *state) int ret = -1; int port = CLI_GLUSTERD_PORT; xlator_t *this = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif this = THIS; cli_rpc_prog = &cli_prog; @@ -621,7 +626,8 @@ cli_rpc_init (struct cli_state *state) goto out; ret = dict_set_str (options, "transport.address-family", - "inet"); + addr_family); + if (ret) goto out; } diff --git a/configure.ac b/configure.ac index cfefa218156..86c6bcfcc4d 100644 --- a/configure.ac +++ b/configure.ac @@ -72,6 +72,8 @@ AC_CONFIG_FILES([Makefile xlators/cluster/Makefile xlators/cluster/afr/Makefile xlators/cluster/afr/src/Makefile + xlators/cluster/aha/Makefile + xlators/cluster/aha/src/Makefile xlators/cluster/stripe/Makefile xlators/cluster/stripe/src/Makefile xlators/cluster/dht/Makefile @@ -275,7 +277,19 @@ if test "x$enable_debug" = "xyes"; then CFLAGS="${CFLAGS} -g -O0 -DDEBUG" else BUILD_DEBUG=no - CFLAGS="${CFLAGS} -g -O2" + CFLAGS="${CFLAGS} -g" +fi + +AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.])) +if test "x$with_fbextras" = "xyes"; then + BUILD_FBEXTRAS=yes +else + BUILD_FBEXTRAS=no +fi + +AC_ARG_ENABLE([privport_prefer], AC_HELP_STRING([--disable-privport_prefer], [Disable preferred usage of privleged ports.])) +if test "x$enable_privport_prefer" = "xno"; then + 
CFLAGS="${CFLAGS} -DNO_PRIVPORT" fi case $host_os in @@ -908,6 +922,16 @@ AC_SUBST(GF_DISTRIBUTION) GF_HOST_OS="" GF_LDFLAGS="-rdynamic" +TESTER_CFLAGS="" + +dnl include tirpc for FB builds +if test "x$BUILD_FBEXTRAS" = "xyes"; then + TIRPC_CFLAGS="-I/usr/include/tirpc" + GF_LDFLAGS="-lfbtirpc $GF_LDFLAGS" + GF_CFLAGS="$GF_CFLAGS $TIRPC_CFLAGS -DIPV6_DEFAULT" + TESTER_CFLAGS="$TESTER_CFLAGS -lfbtirpc" +fi + dnl check for gcc -Werror=format-security saved_CFLAGS=$CFLAGS CFLAGS="-Wformat -Werror=format-security" @@ -1099,6 +1123,12 @@ AC_ARG_ENABLE([debug], AC_HELP_STRING([--enable-debug], [Enable debug build options.])) +AC_ARG_ENABLE([mempool], + AC_HELP_STRING([--disable-mempool], + [Disable the Gluster memory pooler.])) +if test "x$enable_mempool" = "xno"; then + CFLAGS="${CFLAGS} -DDISABLE_MEMPOOL" +fi # syslog section AC_ARG_ENABLE([syslog], @@ -1294,12 +1324,14 @@ AC_SUBST([GF_CPPFLAGS]) AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS") AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS") AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS") +AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes") AC_SUBST(GLUSTERD_WORKDIR) AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! -d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd ) AC_SUBST(GLUSTERD_VOLFILE) AC_SUBST(GLUSTERFS_LIBEXECDIR) AC_SUBST(GLUSTERFSD_MISCDIR) +AC_SUBST(TESTER_CFLAGS) dnl pkg-config versioning dnl diff --git a/glusterfs.spec.in b/glusterfs.spec.in index b28bb426555..29bf00c60a9 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -13,6 +13,10 @@ # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with debug %{?_with_debug:%global _with_debug --enable-debug} +# if you wish to compile an rpm with Facebook specfic extras... 
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras +%{?_with_fbextras:%global _with_fbextras --with-fbextras} + # if you wish to compile an rpm with cmocka unit testing... # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka %{?_with_cmocka:%global _with_cmocka --enable-cmocka} @@ -196,6 +200,9 @@ BuildRequires: libxml2-devel openssl-devel BuildRequires: libaio-devel libacl-devel BuildRequires: python-devel BuildRequires: python-ctypes +%if ( 0%{?_with_fbextras:1} ) +BuildRequires: fb-libtirpc fb-libtirpc-devel +%endif BuildRequires: userspace-rcu-devel >= 0.7 %if ( 0%{?rhel} && 0%{?rhel} <= 6 ) BuildRequires: automake @@ -513,6 +520,9 @@ Requires: %{name}-cli%{?_isa} = %{version}-%{release} Requires: %{name}-libs%{?_isa} = %{version}-%{release} # some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse Requires: %{name}-fuse%{?_isa} = %{version}-%{release} +%if ( 0%{?_with_fbextras:1} ) +Requires: fb-libtirpc >= 0.2.5-1 +%endif # self-heal daemon, rebalance, nfs-server etc. 
are actually clients Requires: %{name}-api%{?_isa} = %{version}-%{release} Requires: %{name}-client-xlators%{?_isa} = %{version}-%{release} @@ -596,7 +606,8 @@ export CFLAGS %{?_without_ocf} \ %{?_without_rdma} \ %{?_without_syslog} \ - %{?_without_tiering} + %{?_without_tiering} \ + %{?_with_fbextras} # fix hardening and remove rpath in shlibs %if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index c47fa3883c9..556b82742cb 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -1903,9 +1903,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, switch (event) { case RPC_CLNT_DISCONNECT: - GF_LOG_OCCASIONALLY (log_ctr1, "glusterfsd-mgmt", GF_LOG_ERROR, - "failed to connect with remote-host: %s (%s)", - ctx->cmd_args.volfile_server, strerror (errno)); + ctx->cmd_args.connect_attempts++; + + gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, + "Connect attempt with remote-host: %s (%s) (%u/%d)", + ctx->cmd_args.volfile_server, + strerror (errno), + ctx->cmd_args.connect_attempts, + ctx->cmd_args.max_connect_attempts); if (!rpc->disabled) { /* * Check if dnscache is exhausted for current server @@ -1916,8 +1921,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, break; } } + + /* If we run out of servers, AND we attempted to connect + * max connect times, then we should return ENOTCONN + */ server = ctx->cmd_args.curr_server; - if (server->list.next == &ctx->cmd_args.volfile_servers) { + if ((ctx->cmd_args.connect_attempts >= + ctx->cmd_args.max_connect_attempts) && + server->list.next == &ctx->cmd_args.volfile_servers) { if (!ctx->active) need_term = 1; emval = ENOTCONN; @@ -1926,24 +1937,33 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, "Exhausted all volfile servers"); break; } - server = list_entry (server->list.next, typeof(*server), list); - 
ctx->cmd_args.curr_server = server; - ctx->cmd_args.volfile_server = server->volfile_server; - - ret = dict_set_str (rpc_trans->options, "remote-host", - server->volfile_server); - if (ret != 0) { - gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, - "failed to set remote-host: %s", + + /* If we exceed the # of connect attempts, we should + * move onto the next server + */ + if (ctx->cmd_args.connect_attempts >= + ctx->cmd_args.max_connect_attempts || !server) { + server = list_entry (server->list.next, + typeof(*server), list); + ctx->cmd_args.curr_server = server; + ctx->cmd_args.volfile_server = server->volfile_server; + + ret = dict_set_str (rpc_trans->options, "remote-host", + server->volfile_server); + if (ret != 0) { + gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, + "failed to set remote-host: %s", + server->volfile_server); + if (!ctx->active) + need_term = 1; + emval = ENOTCONN; + break; + } + ctx->cmd_args.connect_attempts = 0; + gf_log ("glusterfsd-mgmt", GF_LOG_INFO, + "connecting to next volfile server %s", server->volfile_server); - if (!ctx->active) - need_term = 1; - emval = ENOTCONN; - break; } - gf_log ("glusterfsd-mgmt", GF_LOG_INFO, - "connecting to next volfile server %s", - server->volfile_server); break; case RPC_CLNT_CONNECT: rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn); @@ -1960,7 +1980,7 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, } } - + ctx->cmd_args.connect_attempts = 0; if (is_mgmt_rpc_reconnect) glusterfs_mgmt_pmap_signin (ctx); @@ -2136,7 +2156,8 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx) !strcmp (cmd_args->volfile_server_transport, "unix")) { ret = rpc_transport_unix_options_build (&options, host, 0); } else { - ret = rpc_transport_inet_options_build (&options, host, port); + ret = rpc_transport_inet_options_build (&options, host, port, + NULL); } if (ret) goto out; diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c index 6c7a7c883fa..5022cfc22da 100644 --- 
a/glusterfsd/src/glusterfsd.c +++ b/glusterfsd/src/glusterfsd.c @@ -986,7 +986,7 @@ parse_opts (int key, char *arg, struct argp_state *state) cmd_args->debug_mode = ENABLE_DEBUG_MODE; break; case ARGP_VOLFILE_MAX_FETCH_ATTEMPTS: - cmd_args->max_connect_attempts = 1; + cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS; break; case ARGP_DIRECT_IO_MODE_KEY: @@ -1955,13 +1955,7 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx) } } - /* - This option was made obsolete but parsing it for backward - compatibility with third party applications - */ - if (cmd_args->max_connect_attempts) { - gf_msg ("glusterfs", GF_LOG_WARNING, 0, glusterfsd_msg_33); - } + cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS; #ifdef GF_DARWIN_HOST_OS if (cmd_args->mount_point) diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h index e442bede5db..b5c6b27b534 100644 --- a/glusterfsd/src/glusterfsd.h +++ b/glusterfsd/src/glusterfsd.h @@ -16,7 +16,7 @@ #define DEFAULT_GLUSTERD_VOLFILE CONFDIR "/glusterd.vol" #define DEFAULT_CLIENT_VOLFILE CONFDIR "/glusterfs.vol" #define DEFAULT_SERVER_VOLFILE CONFDIR "/glusterfsd.vol" - +#define DEFAULT_MAX_CONNECT_ATTEMPTS 200 #define DEFAULT_EVENT_POOL_SIZE 16384 #define ARGP_LOG_LEVEL_NONE_OPTION "NONE" diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 18f445ae265..6a5889207d4 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -181,26 +181,16 @@ gf_rev_dns_lookup (const char *ip) { char *fqdn = NULL; int ret = 0; - struct sockaddr_in sa = {0}; - char host_addr[256] = {0, }; GF_VALIDATE_OR_GOTO ("resolver", ip, out); - sa.sin_family = AF_INET; - inet_pton (AF_INET, ip, &sa.sin_addr); - ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr, - sizeof (host_addr), NULL, 0, 0); - + /* Get the FQDN */ + ret = gf_get_hostname_from_ip ((char *)ip, &fqdn); if (ret != 0) { gf_msg ("resolver", GF_LOG_INFO, errno, 
LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve " "hostname for %s", ip); - goto out; } - - /* Get the FQDN */ - fqdn = gf_strdup (host_addr); - out: return fqdn; } @@ -3107,11 +3097,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) char *client_ip_copy = NULL; char *tmp = NULL; char *ip = NULL; + size_t addr_sz = 0; /* if ipv4, reverse lookup the hostname to * allow FQDN based rpc authentication */ - if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) { + if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) && + !valid_ipv4_address (client_ip, strlen (client_ip), 0)) { /* most times, we get a.b.c.d:port form, so check that */ client_ip_copy = gf_strdup (client_ip); if (!client_ip_copy) @@ -3124,12 +3116,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *)&client_sock_in; + addr_sz = sizeof (client_sock_in); client_sock_in.sin_family = AF_INET; ret = inet_pton (AF_INET, ip, (void *)&client_sock_in.sin_addr.s_addr); } else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *) &client_sock_in6; + addr_sz = sizeof (client_sock_in6); client_sock_in6.sin6_family = AF_INET6; ret = inet_pton (AF_INET6, ip, @@ -3143,8 +3137,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) goto out; } + /* You cannot just use sizeof (*client_sockaddr), as per the man page + * the (getnameinfo) size must be the size of the underlying sockaddr + * struct e.g. sockaddr_in6 or sockaddr_in. Failure to do so will + * break IPv6 hostname resolution (IPv4 will work only because + * the sockaddr_in struct happens to be of the correct size). 
+ */ ret = getnameinfo (client_sockaddr, - sizeof (*client_sockaddr), + addr_sz, client_hostname, sizeof (client_hostname), NULL, 0, 0); if (ret) { diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h index ea722028eb5..56736e52052 100644 --- a/libglusterfs/src/compat.h +++ b/libglusterfs/src/compat.h @@ -467,6 +467,12 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0); #endif +#ifdef GF_BSD_HOST_OS +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME +#endif + +#ifndef IPV6_DEFAULT + #ifndef IXDR_GET_LONG #define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf)) #endif @@ -483,6 +489,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG(buf, (long)(v)) #endif +#endif /* IPV6_DEFAULT */ + #if defined(__GNUC__) && !defined(RELAX_POISONING) /* Use run API, see run.h */ #include <stdlib.h> /* system(), mkostemp() */ diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 25ddff0d8c4..6a61e641e19 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -27,6 +27,45 @@ #include "statedump.h" #include "libglusterfs-messages.h" +/* this goes with the bucket_size lookup table below */ +#define NUM_DISTINCT_SIZES_32_BIT 32 + +/* this bucket_size lookup table is borrowed from GNU libstdc++ */ +static const uint32_t bucket_sizes[NUM_DISTINCT_SIZES_32_BIT] = { + /* 0 */ 5ul, + /* 1 */ 11ul, + /* 2 */ 23ul, + /* 3 */ 47ul, + /* 4 */ 97ul, + /* 5 */ 199ul, + /* 6 */ 409ul, + /* 7 */ 823ul, + /* 8 */ 1741ul, + /* 9 */ 3469ul, + /* 10 */ 6949ul, + /* 11 */ 14033ul, + /* 12 */ 28411ul, + /* 13 */ 57557ul, + /* 14 */ 116731ul, + /* 15 */ 236897ul, + /* 16 */ 480881ul, + /* 17 */ 976369ul, + /* 18 */ 1982627ul, + /* 19 */ 4026031ul, + /* 20 */ 8175383ul, + /* 21 */ 16601593ul, + /* 22 */ 33712729ul, + /* 23 */ 68460391ul, + /* 24 */ 139022417ul, + /* 25 */ 282312799ul, + /* 26 */ 573292817ul, + /* 27 */ 1164186217ul, + /* 28 */ 
2364114217ul, + /* 29 */ 4294967291ul, + /* 30 */ 4294967291ul, + /* 31 */ 4294967291ul, +}; + struct dict_cmp { dict_t *dict; gf_boolean_t (*value_ignore) (char *k); @@ -47,7 +86,7 @@ get_new_data () } dict_t * -get_new_dict_full (int size_hint) +get_new_dict_full (uint32_t size_hint) { dict_t *dict = mem_get0 (THIS->ctx->dict_pool); @@ -67,17 +106,8 @@ get_new_dict_full (int size_hint) dict->members = &dict->members_internal; } else { - /* - * We actually need to allocate space for size_hint *pointers* - * but we actually allocate space for one *structure*. Since - * a data_pair_t consists of five pointers, we're wasting four - * pointers' worth for N=1, and will overrun what we allocated - * for N>5. If anybody ever starts using size_hint, we'll need - * to fix this. - */ - GF_ASSERT (size_hint <= - (sizeof(data_pair_t) / sizeof(data_pair_t *))); - dict->members = mem_get0 (THIS->ctx->dict_pair_pool); + dict->members = GF_CALLOC (size_hint, sizeof (data_pair_t *), + gf_common_mt_data_pair_t); if (!dict->members) { mem_put (dict); return NULL; @@ -108,6 +138,35 @@ dict_new (void) return dict; } +dict_t * +dict_new_by_size (uint32_t num) +{ + int32_t highest_bit = 0; + uint32_t bucket_size = 0; + dict_t *dict = NULL; + + if (num == 0) + goto out; + +#ifdef _GNU_SOURCE + highest_bit = 32 - __builtin_clz (num); +#else + while (num != 0) { + highest_bit++; + num >>= 1; + } +#endif + + bucket_size = bucket_sizes[highest_bit - 1]; + dict = get_new_dict_full (bucket_size); + + if (dict) + dict_ref (dict); + +out: + return dict; +} + int32_t is_data_equal (data_t *one, data_t *two) @@ -268,7 +327,7 @@ err_out: static data_pair_t * dict_lookup_common (dict_t *this, char *key) { - int hashval = 0; + uint32_t hashval = 0; if (!this || !key) { gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, @@ -279,7 +338,7 @@ dict_lookup_common (dict_t *this, char *key) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. 
*/ - if (this->hash_size != 1) + if (this->hash_size > 1) hashval = SuperFastHash (key, strlen (key)) % this->hash_size; data_pair_t *pair; @@ -319,7 +378,7 @@ dict_lookup (dict_t *this, char *key, data_t **data) static int32_t dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace) { - int hashval = 0; + uint32_t hashval = 0; data_pair_t *pair; char key_free = 0; int tmp = 0; @@ -336,7 +395,7 @@ dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. */ - if (this->hash_size != 1) { + if (this->hash_size > 1) { tmp = SuperFastHash (key, strlen (key)); hashval = (tmp % this->hash_size); } @@ -478,7 +537,7 @@ dict_get (dict_t *this, char *key) void dict_del (dict_t *this, char *key) { - int hashval = 0; + uint32_t hashval = 0; if (!this || !key) { gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL, @@ -491,7 +550,7 @@ dict_del (dict_t *this, char *key) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. 
*/ - if (this->hash_size != 1) + if (this->hash_size > 1) hashval = SuperFastHash (key, strlen (key)) % this->hash_size; data_pair_t *pair = this->members[hashval]; diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index c5b82677e2e..1f6c1a0eae9 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -79,9 +79,9 @@ struct _data_pair { struct _dict { unsigned char is_static:1; - int32_t hash_size; - int32_t count; - int32_t refcount; + uint32_t hash_size; + uint32_t count; + uint32_t refcount; data_pair_t **members; data_pair_t *members_list; char *extra_free; @@ -156,7 +156,7 @@ void *data_to_ptr (data_t *data); data_t *get_new_data (); data_t * data_copy (data_t *old); -dict_t *get_new_dict_full (int size_hint); +dict_t *get_new_dict_full (uint32_t size_hint); dict_t *get_new_dict (); int dict_foreach (dict_t *this, @@ -196,6 +196,7 @@ int dict_keys_join (void *value, int size, dict_t *dict, /* CLEANED UP FUNCTIONS DECLARATIONS */ GF_MUST_CHECK dict_t *dict_new (void); +GF_MUST_CHECK dict_t *dict_new_by_size (uint32_t num); dict_t *dict_copy_with_ref (dict_t *this, dict_t *new); GF_MUST_CHECK int dict_reset (dict_t *dict); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 6e2d370605b..399d695665b 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -330,6 +330,7 @@ struct _cmd_args { uint32_t log_buf_size; uint32_t log_flush_timeout; int32_t max_connect_attempts; + unsigned int connect_attempts; char *print_exports; char *print_netgroups; /* advanced options */ diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c index 17cd68fc206..fa3ac840c43 100644 --- a/libglusterfs/src/iobuf.c +++ b/libglusterfs/src/iobuf.c @@ -30,8 +30,8 @@ struct iobuf_init_config gf_iobuf_init_config[] = { {8 * 1024, 128}, {32 * 1024, 64}, {128 * 1024, 32}, - {256 * 1024, 8}, - {1 * 1024 * 1024, 2}, + {256 * 1024, 64}, + {1 * 1024 * 1024, 64}, }; int diff --git a/libglusterfs/src/latency.c 
b/libglusterfs/src/latency.c index 611615949fa..3399cc7c297 100644 --- a/libglusterfs/src/latency.c +++ b/libglusterfs/src/latency.c @@ -21,6 +21,7 @@ #include "statedump.h" #include "libglusterfs-messages.h" +static int gf_set_fop_from_fn_pointer_warning; void gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn) { @@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void fop = GF_FOP_READDIRP; else if (fops->getspec == *(fop_getspec_t *)&fn) fop = GF_FOP_GETSPEC; - else - fop = -1; + else if (fops->ipc == *(fop_ipc_t *)&fn) + fop = GF_FOP_IPC; + else { + fop = GF_FOP_NULL; + GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning, + "latency", + GF_LOG_WARNING, + "Unknown FOP type"); + } frame->op = fop; } diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c index 88fbdf58319..4d81ade8b60 100644 --- a/libglusterfs/src/mem-pool.c +++ b/libglusterfs/src/mem-pool.c @@ -454,6 +454,10 @@ mem_get0 (struct mem_pool *mem_pool) void * mem_get (struct mem_pool *mem_pool) { +#ifdef DISABLE_MEMPOOL + return GF_CALLOC (1, mem_pool->real_sizeof_type, + gf_common_mt_mem_pool); +#else struct list_head *list = NULL; void *ptr = NULL; int *in_use = NULL; @@ -525,6 +529,7 @@ fwd_addr_out: UNLOCK (&mem_pool->lock); return ptr; +#endif /* DISABLE_MEMPOOL */ } @@ -551,6 +556,10 @@ __is_member (struct mem_pool *pool, void *ptr) void mem_put (void *ptr) { +#ifdef DISABLE_MEMPOOL + GF_FREE (ptr); + return; +#else struct list_head *list = NULL; int *in_use = NULL; void *head = NULL; @@ -628,6 +637,7 @@ mem_put (void *ptr) } } UNLOCK (&pool->lock); +#endif /* DISABLE_MEMPOOL */ } void diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index afa52d8bc45..fc7bf9e5996 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -168,6 +168,7 @@ enum gf_common_mem_types_ { /*lock migration*/ gf_common_mt_lock_mig, gf_common_mt_pthread_t, + gf_common_ping_local_t, 
gf_common_mt_end }; #endif diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c index f7b2bea2f30..903303d1380 100644 --- a/libglusterfs/src/timespec.c +++ b/libglusterfs/src/timespec.c @@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta) ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000); ts->tv_sec += delta.tv_sec; } + +void timespec_sub (const struct timespec *begin, const struct timespec *end, + struct timespec *res) +{ + if (end->tv_nsec < begin->tv_nsec) { + res->tv_sec = end->tv_sec - begin->tv_sec - 1; + res->tv_nsec = end->tv_nsec + 1000000000 - begin->tv_nsec; + } else { + res->tv_sec = end->tv_sec - begin->tv_sec; + res->tv_nsec = end->tv_nsec - begin->tv_nsec; + } +} diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h index f37194b97cf..9c393ee7166 100644 --- a/libglusterfs/src/timespec.h +++ b/libglusterfs/src/timespec.h @@ -20,5 +20,8 @@ void timespec_now (struct timespec *ts); void timespec_adjust_delta (struct timespec *ts, struct timespec delta); +void timespec_sub (const struct timespec *begin, + const struct timespec *end, + struct timespec *res); #endif /* __INCLUDE_TIMESPEC_H__ */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 3c1cde50fa0..b2529d3c4f7 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -117,6 +117,14 @@ out: } +static const char *xlator_lib_path (void) +{ + const char *libdir_env = getenv ("GLUSTER_LIBDIR"); + + return libdir_env ? 
libdir_env : XLATORDIR; +} + + int xlator_volopt_dynload (char *xlator_type, void **dl_handle, volume_opt_list_t *opt_list) @@ -130,9 +138,11 @@ xlator_volopt_dynload (char *xlator_type, void **dl_handle, /* socket.so doesn't fall under the default xlator directory, hence we * need this check */ if (!strstr(xlator_type, "rpc-transport")) - ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xlator_type); + ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), + xlator_type); else - ret = gf_asprintf (&name, "%s/%s.so", XLATORPARENTDIR, xlator_type); + ret = gf_asprintf (&name, "%s/../%s.so", xlator_lib_path (), + xlator_type); if (-1 == ret) { goto out; } @@ -183,7 +193,7 @@ xlator_dynload (xlator_t *xl) INIT_LIST_HEAD (&xl->volume_options); - ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xl->type); + ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), xl->type); if (-1 == ret) { goto out; } diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index 70e6f0a108d..2e04893c487 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -927,6 +927,7 @@ struct _xlator { gf_loglevel_t loglevel; /* Log level for translator */ + fop_latency_t client_latency; /* for latency measurement */ fop_latency_t latencies[GF_FOP_MAXVALUE]; @@ -17,7 +17,7 @@ done shift $((OPTIND-1)) -branch="release-3.8"; +branch="release-3.8-fb"; set_hooks_commit_msg() { diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index a7ff866ac99..7ce066dec5f 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -18,6 +18,7 @@ #include "mem-pool.h" #include "xdr-rpc.h" #include "rpc-common-xdr.h" +#include "timespec.h" char *clnt_ping_procs[GF_DUMP_MAXVALUE] = { @@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = { .procnames = clnt_ping_procs, }; +struct ping_local { + struct rpc_clnt *rpc; + struct timespec submit_time; +}; + /* Must be called under conn->lock */ static int __rpc_clnt_rearm_ping_timer 
(struct rpc_clnt *rpc, gf_timer_cbk_t cbk) @@ -166,16 +172,48 @@ out: return; } +void +_update_client_latency (const rpc_clnt_connection_t *conn, + call_frame_t *frame, + uint64_t elapsed_usec) +{ + fop_latency_t *lat; + + lat = &frame->this->client_latency; + + if (elapsed_usec < lat->min) { + lat->min = elapsed_usec; + } + + if (elapsed_usec > lat->max) { + lat->max = elapsed_usec; + } + + lat->total += elapsed_usec; + lat->count++; + lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count; + gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, " + "avg: %0.6lf ms, count:%ld", + conn->trans->peerinfo.identifier, elapsed_usec / 1000.0, + lat->mean / 1000.0, lat->count); +} + int rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { - struct rpc_clnt *rpc = NULL; + struct ping_local *local = NULL; xlator_t *this = NULL; rpc_clnt_connection_t *conn = NULL; + call_frame_t *frame = NULL; struct timespec timeout = {0, }; + struct timespec now; + struct timespec delta; + int64_t latency_usec = 0; + int ret = 0; int unref = 0; + gf_boolean_t call_notify = _gf_false; if (!myframe) { gf_log (THIS->name, GF_LOG_WARNING, @@ -185,14 +223,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, frame = myframe; this = frame->this; - rpc = frame->local; - frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */ - conn = &rpc->conn; + local = frame->local; + conn = &local->rpc->conn; pthread_mutex_lock (&conn->lock); { if (req->rpc_status == -1) { - unref = rpc_clnt_remove_ping_timer_locked (rpc); + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (unref) { gf_log (this->name, GF_LOG_WARNING, "socket or ib related error"); @@ -207,8 +244,15 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, goto unlock; } - unref = rpc_clnt_remove_ping_timer_locked (rpc); - if (__rpc_clnt_rearm_ping_timer (rpc, + timespec_now (&now); + timespec_sub (&local->submit_time, &now, 
&delta); + latency_usec = delta.tv_sec * 1000000UL + + delta.tv_nsec / 1000UL; + + _update_client_latency (conn, frame, latency_usec); + call_notify = _gf_true; + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); + if (__rpc_clnt_rearm_ping_timer (local->rpc, rpc_clnt_start_ping) == -1) { gf_log (this->name, GF_LOG_WARNING, "failed to set the ping timer"); @@ -217,12 +261,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, } unlock: pthread_mutex_unlock (&conn->lock); + + if (call_notify) { + ret = local->rpc->notifyfn (local->rpc, this, + RPC_CLNT_PING, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "RPC_CLNT_PING notify failed"); + } + } out: if (unref) - rpc_clnt_unref (rpc); + rpc_clnt_unref (local->rpc); - if (frame) + if (frame) { + GF_FREE (frame->local); + frame->local = NULL; STACK_DESTROY (frame->root); + } return 0; } @@ -232,18 +288,27 @@ rpc_clnt_ping (struct rpc_clnt *rpc) call_frame_t *frame = NULL; int32_t ret = -1; rpc_clnt_connection_t *conn = NULL; + struct ping_local *local = NULL; conn = &rpc->conn; + local = GF_MALLOC (sizeof(struct ping_local), gf_common_ping_local_t); + if (!local) + return ret; frame = create_frame (THIS, THIS->ctx->pool); - if (!frame) + if (!frame) { + GF_FREE (local); return ret; + } - frame->local = rpc; + local->rpc = rpc; + timespec_now (&local->submit_time); + frame->local = local; ret = rpc_clnt_submit (rpc, &clnt_ping_prog, GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0, NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL); if (ret) { + /* FIXME: should we free the frame here? Methinks so! 
*/ gf_log (THIS->name, GF_LOG_ERROR, "failed to start ping timer"); } diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index 3a5b287cd49..2ccaa56e4cb 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -19,6 +19,7 @@ typedef enum { RPC_CLNT_CONNECT, RPC_CLNT_DISCONNECT, + RPC_CLNT_PING, RPC_CLNT_MSG, RPC_CLNT_DESTROY } rpc_clnt_event_t; diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c index e224dcc022e..5556740ca81 100644 --- a/rpc/rpc-lib/src/rpc-transport.c +++ b/rpc/rpc-lib/src/rpc-transport.c @@ -166,6 +166,19 @@ out: +int rpc_transport_lib_path (char **name, char *type) +{ + int ret = -1; + char *libdir_env = getenv ("GLUSTER_LIBDIR"); + + ret = libdir_env == NULL + ? gf_asprintf (name, "%s/%s.so", RPC_TRANSPORTDIR, type) + : gf_asprintf (name, "%s/rpc-transport/%s.so", libdir_env, type); + return ret; +} + + + rpc_transport_t * rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name) { @@ -274,7 +287,7 @@ rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name) goto fail; } - ret = gf_asprintf (&name, "%s/%s.so", RPC_TRANSPORTDIR, type); + ret = rpc_transport_lib_path (&name, type); if (-1 == ret) { goto fail; } @@ -652,18 +665,37 @@ out: return ret; } +/** @brief build a dictionary containing basic transport options. + * + * @param[out] options: will be set to a newly created dictionary on success. + * @param[in] hostname: desired target hostname. + * @param[in] port: desired target port. + * @param[in] addr_family (optional): desired address family. If NULL, + * default will be used. + * + * @returns zero on success. 
+ */ int rpc_transport_inet_options_build (dict_t **options, const char *hostname, - int port) + int port, const char *addr_family) { dict_t *dict = NULL; char *host = NULL; int ret = -1; +#ifdef IPV6_DEFAULT + const char *addr_family_default = "inet6"; +#else + const char *addr_family_default = "inet"; +#endif GF_ASSERT (options); GF_ASSERT (hostname); GF_ASSERT (port >= 1024); + if (!addr_family) { + addr_family = addr_family_default; + } + dict = dict_new (); if (!dict) goto out; @@ -688,6 +720,14 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, goto out; } + ret = dict_set_str (dict, "transport.address-family", + (char *)addr_family); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set address-family to %s", addr_family); + goto out; + } + ret = dict_set_str (dict, "transport-type", "socket"); if (ret) { gf_log (THIS->name, GF_LOG_WARNING, diff --git a/rpc/rpc-lib/src/rpc-transport.h b/rpc/rpc-lib/src/rpc-transport.h index f0add065065..0f555462ea4 100644 --- a/rpc/rpc-lib/src/rpc-transport.h +++ b/rpc/rpc-lib/src/rpc-transport.h @@ -311,5 +311,6 @@ rpc_transport_unix_options_build (dict_t **options, char *filepath, int frame_timeout); int -rpc_transport_inet_options_build (dict_t **options, const char *hostname, int port); +rpc_transport_inet_options_build (dict_t **options, const char *hostname, + int port, const char *addr_family); #endif /* __RPC_TRANSPORT_H__ */ diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c index f07e745a4b3..9dc3bff427c 100644 --- a/rpc/rpc-lib/src/rpcsvc.c +++ b/rpc/rpc-lib/src/rpcsvc.c @@ -37,6 +37,10 @@ #include <stdarg.h> #include <stdio.h> +#ifdef IPV6_DEFAULT +#include <netconfig.h> +#endif + #include "xdr-rpcclnt.h" #include "glusterfs-acl.h" @@ -1363,6 +1367,82 @@ rpcsvc_error_reply (rpcsvc_request_t *req) return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL); } +#ifdef IPV6_DEFAULT +int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t 
port) +{ + const int IP_BUF_LEN = 64; + char addr_buf[IP_BUF_LEN]; + + int err = 0; + bool_t success = 0; + struct netconfig *nc; + struct netbuf *nb; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + + err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff, + port & 0xff); + if (err < 0) { + err = -1; + goto out; + } + + nb = uaddr2taddr (nc, addr_buf); + if (!nb) { + err = -1; + goto out; + } + + success = rpcb_set (newprog->prognum, newprog->progver, nc, nb); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6" + " service with rpcbind"); + } + + err = 0; + +out: + return err; +} + +int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog) +{ + int err = 0; + bool_t success = 0; + struct netconfig *nc; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + success = rpcb_unset (newprog->prognum, newprog->progver, nc); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6" + " service with rpcbind"); + } + + err = 0; +out: + return err; +} +#endif /* Register the program with the local portmapper service. 
*/ int @@ -1527,7 +1607,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) " program failed"); goto out; } - +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_unregister_rpcbind6 (program); + if (ret == -1) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)" + " unregistration of program failed"); + goto out; + } +#endif pthread_mutex_lock (&svc->rpclock); { list_for_each_entry (prog, &svc->programs, program) { diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h index 02e467e68a7..1032df03b0e 100644 --- a/rpc/rpc-lib/src/rpcsvc.h +++ b/rpc/rpc-lib/src/rpcsvc.h @@ -437,6 +437,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener); extern int rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port); +#ifdef IPV6_DEFAULT +extern int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port); +#endif + extern int rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog); diff --git a/rpc/rpc-transport/rdma/src/name.c b/rpc/rpc-transport/rdma/src/name.c index 8003b1c87a0..b9d3269eb73 100644 --- a/rpc/rpc-transport/rdma/src/name.c +++ b/rpc/rpc-transport/rdma/src/name.c @@ -54,6 +54,10 @@ af_inet_bind_to_port_lt_ceiling (struct rdma_cm_id *cm_id, struct sockaddr *sockaddr, socklen_t sockaddr_len, uint32_t ceiling) { +#if defined(NO_PRIVPORT) + _assign_port(sockaddr, 0); + return rdma_bind_addr (cm_id, sockaddr); +#else int32_t ret = -1; uint16_t port = ceiling - 1; gf_boolean_t ports[GF_PORT_MAX]; @@ -100,6 +104,7 @@ loop: } return ret; +#endif /* NO_PRIVPORT */ } #if 0 diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c index 0e34dc211fe..cab4161c076 100644 --- a/rpc/rpc-transport/socket/src/name.c +++ b/rpc/rpc-transport/socket/src/name.c @@ -42,6 +42,10 @@ static int32_t af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, socklen_t sockaddr_len, uint32_t ceiling) { +#if defined(NO_PRIVPORT) + _assign_port(sockaddr, 0); + return bind (fd, 
sockaddr, sockaddr_len); +#else int32_t ret = -1; uint16_t port = ceiling - 1; gf_boolean_t ports[GF_PORT_MAX]; @@ -88,6 +92,7 @@ loop: } return ret; +#endif /* NO_PRIVPORT */ } static int32_t @@ -557,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) data_t *address_family_data = NULL; int32_t ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; + sa_family_t default_family = AF_INET6; +#else + char *addr_family = "inet"; + sa_family_t default_family = AF_INET; +#endif + GF_VALIDATE_OR_GOTO ("socket", sa_family, out); address_family_data = dict_get (this->options, @@ -581,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) } } else { gf_log (this->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting to inet"); - *sa_family = AF_INET; + "option address-family not specified, " + "defaulting to %s", addr_family); + *sa_family = default_family; } ret = 0; diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c index ae551dcfae7..8c1690f820c 100644 --- a/rpc/rpc-transport/socket/src/socket.c +++ b/rpc/rpc-transport/socket/src/socket.c @@ -866,7 +866,7 @@ __socket_keepalive (int fd, int family, int keepalive_intvl, goto err; } #else - if (family != AF_INET) + if (family != AF_INET && family != AF_INET6) goto done; ret = setsockopt (fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_idle, @@ -3009,6 +3009,21 @@ socket_connect (rpc_transport_t *this, int port) } } + /* Make sure we are not vulnerable to someone setting + * net.ipv6.bindv6only to 1 so that gluster services are + * avalable over IPv4 & IPv6. 
+ */ + int disable_v6only = 0; + + if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&disable_v6only, + sizeof (disable_v6only)) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Error disabling sockopt IPV6_V6ONLY: \"%s\"", + strerror (errno)); + } + + if (priv->nodelay && (sa_family != AF_UNIX)) { ret = __socket_nodelay (priv->sock); diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x index 8462dcc258a..5ec8109d828 100644 --- a/rpc/xdr/src/glusterfs-fops.x +++ b/rpc/xdr/src/glusterfs-fops.x @@ -84,6 +84,7 @@ enum glusterfs_event_t { GF_EVENT_UPCALL, GF_EVENT_SCRUB_STATUS, GF_EVENT_SOME_CHILD_DOWN, + GF_EVENT_CHILD_PING, GF_EVENT_MAXVAL }; diff --git a/run-tests.sh b/run-tests.sh index 1487f30d832..866ab0464b4 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -182,12 +182,14 @@ function get_test_status () # for later. Why does the key have the distro and version then? # Because changing the key in all test files would be very big process # updating just this function with a better logic much simpler. + # + # FB Edit: For FB tests we are disabling NetBSD testing. + # Linux) result=$(grep -e "^#G_TESTDEF_TEST_STATUS_CENTOS6" $test_name | \ awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;; NetBSD) - result=$(grep -e "^#G_TESTDEF_TEST_STATUS_NETBSD7" $test_name | \ - awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;; + result="KNOWN_ISSUE" ;; *) result="ENABLED" ;; esac diff --git a/tests/basic/accept-v6v4.t b/tests/basic/accept-v6v4.t new file mode 100644 index 00000000000..7128c12c6be --- /dev/null +++ b/tests/basic/accept-v6v4.t @@ -0,0 +1,122 @@ +#!/bin/bash + +. $(dirname $0)/../nfs.rc + +# +# This test ensures that GlusterFS provides NFS, Mount and its Management daemon +# over both IPv4 and IPv6. It uses netcat to check the services running on both +# IPv4 & IPv6 addresses as well as a mount to test that mount & nfs work. 
+# + +IPV4_SUPPORT=false +IPV6_SUPPORT=false + +host $HOSTNAME | grep -q "has address" && IPV4_SUPPORT=true +host $HOSTNAME | grep -q "has IPv6 address" && IPV6_SUPPORT=true + +. $(dirname $0)/../include.rc + +cleanup; + +mkdir -p $B0/b{0,1,2} + +# make sure no registered rpcbind services are running +service rpcbind restart + +TEST glusterd +TEST pidof glusterd + +TEST $CLI vol create $V0 replica 3 $H0:$B0/b0 $H0:$B0/b1 $H0:$B0/b2 + +TEST $CLI vol set $V0 cluster.self-heal-daemon off +TEST $CLI vol set $V0 nfs.disable off +TEST $CLI vol set $V0 cluster.choose-local off +TEST $CLI vol start $V0 + +MOUNTD_PORT=38465 +MGMTD_PORT=24007 +NFSD_PORT=2049 + +function check_ip_port { + ip=$1 + port=$2 + type=$3 + + nc_flags="" + if [ "$type" == "v6" ] && [ "$ip" == "NONE" ]; then + echo "Y" + return + else + nc_flags="-6" + fi + + if [ "$type" == "v4" ] && [ "$ip" == "NONE" ]; then + echo "Y" + return + fi + + if exec 3<>/dev/tcp/$ip/$port; then + echo "Y" + else + echo "N" + fi +} + +function check_nfs { + ip=$1 + type=$2 + + if [ "$ip" == "NONE" ]; then + echo "Y" + return + fi + + if [ "$type" == "v6" ]; then + addr="[$ip]" + else + addr="$ip" + fi + + if mount_nfs $addr:/$V0 $N0; then + umount_nfs $N0 + echo "Y" + else + echo "N" + fi +} + +if [ ! $IPV4_SUPPORT ] && [ ! 
$IPV6_SUPPORT ]; then + exit 1 +fi + +# Get the V4 & V6 addresses of this host +if $IPV4_SUPPORT; then + V4=$(host $HOSTNAME | head -n1 | awk -F ' ' '{print $4}') +else + V4="NONE" +fi + +if $IPV6_SUPPORT; then + V6=$(host $HOSTNAME | tail -n1 | awk -F ' ' '{print $5}') +else + V6="NONE" +fi + +# First check the management daemon +EXPECT "Y" check_ip_port $V6 $MGMTD_PORT "v6" +EXPECT "Y" check_ip_port $V4 $MGMTD_PORT "v4" + +# Give the MOUNT/NFS Daemon some time to start up +sleep 4 + +EXPECT "Y" check_ip_port $V4 $MOUNTD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $MOUNTD_PORT "v4" + +EXPECT "Y" check_ip_port $V4 $NFSD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $NFSD_PORT "v4" + +# Mount the file system +EXPECT "Y" check_nfs $V6 "v6" +EXPECT "Y" check_nfs $V4 "v4" + +cleanup; diff --git a/tests/basic/dht-min-free-space.t b/tests/basic/dht-min-free-space.t new file mode 100755 index 00000000000..17d10cc39a5 --- /dev/null +++ b/tests/basic/dht-min-free-space.t @@ -0,0 +1,78 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +cleanup; + +grep $B0/patchy1 /proc/mounts &> /dev/null && umount $B0/patchy1 +grep $B0/patchy2 /proc/mounts &> /dev/null && umount $B0/patchy2 +losetup -d /dev/loop0 2> /dev/null +losetup -d /dev/loop1 2> /dev/null +mkdir $B0/${V0}{1..2} + +TEST glusterd + +TEST dd if=/dev/zero of=/tmp/${V0}-dev1 bs=1M count=30 +TEST dd if=/dev/zero of=/tmp/${V0}-dev2 bs=1M count=30 + +TEST losetup /dev/loop0 /tmp/${V0}-dev1 +TEST losetup /dev/loop1 /tmp/${V0}-dev2 + +TEST mkfs.xfs /dev/loop0 +TEST mkfs.xfs /dev/loop1 + +TEST mount /dev/loop0 $B0/${V0}1 +TEST mount /dev/loop1 $B0/${V0}2 + +TEST $CLI volume create $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}2 +TEST $CLI volume set $V0 cluster.min-free-disk 2MB +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 0 +TEST $CLI volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +#################################### +# Test re-directs of file creation # +#################################### + +# This should work, no redirects +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=8 +TEST [ -f /d/backends/${V0}2/testfile1 ] && [ ! -k /d/backends/${V0}1/testfile1 ] + +TEST $CLI volume set $V0 cluster.min-free-disk 19MB + +# This should work, & the file redirected +# Subvolume 2 should have the linkto & +# Subvolume 1 should have the original +TEST dd if=/dev/zero of=$M0/testfile3 bs=1M count=4 +TEST [ -f /d/backends/${V0}1/testfile3 ] && [ ! -k /d/backends/${V0}1/testfile3 ] +TEST [ -k /d/backends/${V0}2/testfile3 ] + +# This should fail, cluster is full +TEST ! 
dd if=/dev/zero of=$M0/testfile2 bs=1M count=23 + +################### +# Strict mode off # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode off +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=20 +TEST rm -f $M0/testfile1 + +################### +# Strict mode on # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST ! dd if=/dev/zero of=$M0/testfile1 bs=1M count=16 +TEST rm -f $M0/testfile1 + +killall gluster{fs,fsd,d} + +umount -lf $B0/${V0}1 +umount -lf $B0/${V0}2 + +losetup -d /dev/loop0 +losetup -d /dev/loop1 + +cleanup; diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common index 83c4463a912..152e3b51236 100644 --- a/tests/basic/ec/ec-common +++ b/tests/basic/ec/ec-common @@ -45,7 +45,7 @@ for size in $SIZE_LIST; do eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }') done -TEST df -h +TEST df -h $M0 TEST stat $M0 for idx in `seq 0 $LAST_BRICK`; do diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 98dd9232c73..3e3467535fb 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -136,7 +136,7 @@ TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024 cs=$(sha1sum $tmp/test | awk '{ print $1 }') -TEST df -h +TEST df -h $M0 TEST stat $M0 for idx in {0..5}; do diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t index fdaf9c2822e..da88bbcb2cc 100644 --- a/tests/basic/exports_parsing.t +++ b/tests/basic/exports_parsing.t @@ -32,7 +32,20 @@ function test_bad_opt () glusterfsd --print-exports $1 2>&1 | sed -n 1p } -EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports +function check_export_line() { + if [ "$1" == "$2" ]; then + echo "Y" + else + echo "N" + fi + return +} + +export_result=$(test_good_file $EXP_FILES/exports) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result" + 
+export_result=$(test_good_file $EXP_FILES/exports-v6) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result" EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports EXPECT_KEYWORD "Error parsing netgroups for:" test_long_netgroup $EXP_FILES/bad_exports diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t index cea8aa737c0..713c7e27579 100644 --- a/tests/basic/fop-sampling.t +++ b/tests/basic/fop-sampling.t @@ -2,13 +2,27 @@ # . $(dirname $0)/../include.rc +. $(dirname $0)/../nfs.rc . $(dirname $0)/../volume.rc -SAMPLE_FILE="$(gluster --print-logdir)/samples/glusterfs_${V0}.samp" +BRICK_SAMPLES="$(gluster --print-logdir)/samples/glusterfsd__d_backends_${V0}0.samp" +NFS_SAMPLES="$(gluster --print-logdir)/samples/glusterfs_nfsd.samp" + +function check_path { + op=$1 + path=$2 + file=$3 + grep $op $file | awk -F, '{print $11}' | grep $path 2>&1 > /dev/null + if [ $? -eq 0 ]; then + echo "Y" + else + echo "N" + fi +} function print_cnt() { local FOP_TYPE=$1 - local FOP_CNT=$(grep ,${FOP_TYPE} ${SAMPLE_FILE} | wc -l) + local FOP_CNT=$(grep ,${FOP_TYPE} ${BRICK_SAMPLES} | wc -l) echo $FOP_CNT } @@ -42,12 +56,18 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} TEST $CLI volume set $V0 nfs.disable off TEST $CLI volume set $V0 diagnostics.latency-measurement on TEST $CLI volume set $V0 diagnostics.count-fop-hits on -TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 5 TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535 TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1 TEST $CLI volume set $V0 diagnostics.stats-dnscache-ttl-sec 3600 - TEST $CLI volume start $V0 + +>${NFS_SAMPLES} +>${BRICK_SAMPLES} + +################# +# Basic Samples # +################# TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 for 
i in {1..5} @@ -58,4 +78,52 @@ done TEST ls -l $M0 EXPECT_WITHIN 6 "OK" check_samples -cleanup +sleep 2 + +################################ +# Paths in the samples # +################################ + +TEST mount_nfs $H0:$V0 $N0 + +ls $N0 &> /dev/null +touch $N0/file1 +stat $N0/file1 &> /dev/null +echo "some data" > $N0/file1 +dd if=/dev/zero of=$N0/file2 bs=1M count=10 conv=fsync +dd if=/dev/zero of=$N0/file1 bs=1M count=1 +cat $N0/file2 &> /dev/null +mkdir -p $N0/dir1 +rmdir $N0/dir1 +rm $N0/file1 +rm $N0/file2 + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FINODELK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ENTRYLK / $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $BRICK_SAMPLES + + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path READ /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 
"Y" check_path UNLINK /file2 $NFS_SAMPLES + +cleanup; diff --git a/tests/basic/fops-sanity-gfproxy.t b/tests/basic/fops-sanity-gfproxy.t new file mode 100755 index 00000000000..b3bb8a502cc --- /dev/null +++ b/tests/basic/fops-sanity-gfproxy.t @@ -0,0 +1,32 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/brick1; +EXPECT 'Created' volinfo_field $V0 'Status'; + +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +#gfproxy server +TEST glusterfs --volfile-id=gfproxy/$V0 --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log + +#mount on a random dir +TEST glusterfs --entry-timeout=3600 --attribute-timeout=3600 -s $H0 --volfile-id=gfproxy-client/$V0 $M0 --direct-io-mode=yes +TEST grep gfproxy-client /proc/mounts + +build_tester $(dirname $0)/fops-sanity.c + +TEST cp $(dirname $0)/fops-sanity $M0 +cd $M0 +TEST ./fops-sanity $V0 +cd - +rm -f $(dirname $0)/fops-sanity + +cleanup; diff --git a/tests/basic/gfproxy.t b/tests/basic/gfproxy.t new file mode 100644 index 00000000000..71c6788db76 --- /dev/null +++ b/tests/basic/gfproxy.t @@ -0,0 +1,74 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../nfs.rc + +cleanup; + +function start_gfproxyd { + glusterfs --volfile-id=gfproxy/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log +} + +function restart_gfproxyd { + pkill -f gfproxy/${V0} + start_gfproxyd +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 config.gfproxyd-remote-host $H0 +TEST $CLI volume start $V0 + +sleep 2 + +REGULAR_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-fuse.vol" +GFPROXY_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-gfproxy-fuse.vol" +GFPROXYD_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.gfproxyd.vol" + +# Client volfile must exist +TEST [ -f $GFPROXY_CLIENT_VOLFILE ] + +# AHA & write-behind translators must exist +TEST grep "cluster/aha" $GFPROXY_CLIENT_VOLFILE +TEST grep "performance/write-behind" $GFPROXY_CLIENT_VOLFILE + +# Make sure we didn't screw up the existing client +TEST grep "performance/write-behind" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/replicate" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/distribute" $REGULAR_CLIENT_VOLFILE + +TEST [ -f $GFPROXYD_VOLFILE ] + +TEST grep "cluster/replicate" $GFPROXYD_VOLFILE +TEST grep "cluster/distribute" $GFPROXYD_VOLFILE + +# AHA & write-behind must *not* exist +TEST ! grep "cluster/aha" $GFPROXYD_VOLFILE +TEST ! 
grep "performance/write-behind" $GFPROXYD_VOLFILE + +# Test that we can start the server and the client +TEST start_gfproxyd +TEST glusterfs --volfile-id=gfproxy-client/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy-client.log $M0 +sleep 2 +TEST grep gfproxy-client/${V0} /proc/mounts + +# Write data to the mount and checksum it +TEST dd if=/dev/urandom bs=1M count=10 of=/tmp/testfile1 +md5=$(md5sum /tmp/testfile1 | awk '{print $1}') +TEST cp -v /tmp/testfile1 $M0/testfile1 +TEST [ "$(md5sum $M0/testfile1 | awk '{print $1}')" == "$md5" ] + +rm /tmp/testfile1 + +dd if=/dev/zero of=$N0/bigfile bs=1M count=3072 & +BG_STRESS_PID=$! + +sleep 3 + +restart_gfproxyd + +TEST wait $BG_STRESS_PID + +cleanup; diff --git a/tests/basic/glusterd/volfile_server_switch.t b/tests/basic/glusterd/volfile_server_switch.t index 0b0e6470244..0b01398215c 100644 --- a/tests/basic/glusterd/volfile_server_switch.t +++ b/tests/basic/glusterd/volfile_server_switch.t @@ -1,5 +1,8 @@ #!/bin/bash +#G_TESTDEF_TEST_STATUS_CENTOS6=KNOWN_ISSUE,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=KNOWN_ISSUE,BUG=000000 + . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc . $(dirname $0)/../../cluster.rc diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t new file mode 100644 index 00000000000..f3655eaef3b --- /dev/null +++ b/tests/basic/halo-failover-disabled.t @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +# brick immediatelly, and md5s will show they are equal once +# the write completes. +# 4. The mount should also be RW after the brick is killed as +# quorum will be immediately restored by swapping in the +# other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.halo-failover-enabled off +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG + +# Use a large ping time here so the spare brick is not marked up +# based on the ping time. The only way it can get marked up is +# by being swapped in via the down event (which is what we are disabling). +TEST $CLI volume set $V0 network.ping-timeout 1000 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. 
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX} + +# Make sure two children are down and one is up. +EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3 + +# Test that quorum should fail and the mount is RO, the reason here +# is that although there _is_ another brick running which _could_ +# take the failed bricks place, it is not marked "up" so quorum +# will not be fullfilled. If we waited 1000 second the brick would +# indeed be activated based on ping time, but for our test we want +# the decision to be solely "down event" driven, not ping driven. +TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX + +# Test that quorum should be restored and the file is writable +TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 + +cleanup diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t new file mode 100644 index 00000000000..2dddf9951fa --- /dev/null +++ b/tests/basic/halo-failover-enabled.t @@ -0,0 +1,87 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +# brick immediatelly, and md5s will show they are equal once +# the write completes. +# 4. The mount should also be RW after the brick is killed as +# quorum will be immediately restored by swapping in the +# other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-failover-enabled on +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 network.ping-timeout 20 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST [ -n "$KILL_IDX" ] +# NB: UP_CHILDREN is the set of children that should be up after we kill +# the brick indicated by KILL_IDX, *not* the set of children which are +# currently up! 
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g")) +UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)" +UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)" +VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)" + +# The victim brick should have a copy of the file. +TEST [ -n "$VICTIM_HAS_TEST" ] + +# Of the bricks which will remain standing, there should be only one +# brick which has the file called test. If the both have the first +# test file, the test is invalid as all the bricks are up and the +# halo-max-replicas is not being honored; e.g. bug exists. +ONLY_ONE=$((([ -z "$UP2_HAS_TEST" ] || [ -z "$UP1_HAS_TEST" ]) && + ([ -n "$UP2_HAS_TEST" ] || [ -n "$UP1_HAS_TEST" ])) && echo true) +TEST [ "x$ONLY_ONE" == "xtrue" ] + +echo "Failing child ${KILL_IDX}..." +TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX} + +# Test the mount is still RW (i.e. quorum works) +TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync + +# Calulate the MD5s +MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1) +MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1) + +# Verify the two up bricks have identical MD5s, if both are identical +# then we must have successfully failed-over to the brick which was +# previously proven to be down (via the ONLY_ONE test). +TEST [ "$MD5_UP1" == "$MD5_UP2" ] + +cleanup diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t new file mode 100644 index 00000000000..4574fdfe41e --- /dev/null +++ b/tests/basic/halo-hybrid.t @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test for the Halo hybrid feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +# heal daemon is off to start. +# 2. Write some data +# 3. Verify hybrid code chose children for lookups +# 4. Verify hybrid code chose child for reads +# 5. Verify hybrid code wrote synchronously to all replicas +# + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +function found_fuse_log_msg { + local dir="$1" + local msg="$2" + local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l) + if (( $cnt == 1 )); then + echo "Y" + else + echo "N" + fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-hybrid-mode True +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level TRACE +TEST $CLI volume start $V0 + +# Start a synchronous mount +TEST glusterfs --volfile-id=/$V0 \ + --xlator-option *replicate*.halo-max-latency=9999 \ + --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 +sleep 2 +cd $M0 + +TEST mkdir testdir +TEST cd testdir +for i in {1..5} +do + dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null +done +TEST ls -l + +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs" +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child" + +B0_CNT=$(ls $B0/${V0}0/testdir | wc -l) +B1_CNT=$(ls $B0/${V0}1/testdir | wc -l) +B2_CNT=$(ls $B0/${V0}2/testdir | wc -l) + +# Writes should be synchronous, all should have same +# file count +TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))" + +cleanup diff --git a/tests/basic/halo.t b/tests/basic/halo.t new file mode 100644 index 00000000000..25aca3442ab --- /dev/null +++ b/tests/basic/halo.t @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Test for the Halo geo-replication feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +# heal daemon is off to start. +# 2. Write some data +# 3. 
Verify at least one of the bricks did not receive the writes. +# 4. Turn the heal daemon on +# 5. Within 30 seconds the SHD should async heal the data over +# to the 3rd brick. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 + +for i in {1..5} +do + dd if=/dev/urandom of=f bs=1M count=1 2>/dev/null + mkdir a; cd a; +done + +B0_CNT=$(ls $B0/${V0}0 | wc -l) +B1_CNT=$(ls $B0/${V0}1 | wc -l) +B2_CNT=$(ls $B0/${V0}2 | wc -l) + +# One of the brick dirs should be empty +TEST "(($B0_CNT == 0 || $B1_CNT == 0 || $B2_CNT == 0))" + +# Ok, turn the heal daemon on and verify it heals it up +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 +cleanup diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t index 9df5cb45c3b..cd0189788ba 100755 --- a/tests/basic/mount-nfs-auth.t +++ b/tests/basic/mount-nfs-auth.t @@ -15,6 +15,9 @@ TEST glusterd TEST pidof glusterd TEST $CLI volume info +H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1) +H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}') + # Export variables for allow & deny EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" EXPORT_ALLOW_SLASH="/$V0/ 
$H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" @@ -37,6 +40,10 @@ function build_dirs () { mkdir -p $B0/b{0,1,2}/L1/L2/L3 } +function export_allow_this_host_ipv6 () { + printf "$EXPORT_ALLOW6\n" > /var/lib/glusterd/nfs/exports +} + function export_allow_this_host () { printf "$EXPORT_ALLOW\n" > ${NFSDIR}/exports } @@ -186,6 +193,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available ## Mount NFS EXPECT "Y" check_mount_success $V0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + +## Mount NFS using the IPv6 export +export_allow_this_host_ipv6 +EXPECT "Y" check_mount_success $V0 ## Disallow host TEST export_deny_this_host diff --git a/tests/basic/write-behind.t b/tests/basic/write-behind.t new file mode 100644 index 00000000000..edad59786af --- /dev/null +++ b/tests/basic/write-behind.t @@ -0,0 +1,53 @@ +#!/bin/bash +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function clear_stats { + > /var/lib/glusterfs/stats/glusterfs_d_backends_${V0}0.dump +} + +function got_expected_write_count { + expected_size=$1 + expected_value=$2 + grep aggr.write_${expected_size} "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump" | grep $expected_value + if [ $? 
== 0 ]; then + echo "Y"; + else + echo "N"; + fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} + +# These are needed for our tracking of write sizes +TEST $CLI volume set $V0 diagnostics.latency-measurement on +TEST $CLI volume set $V0 diagnostics.count-fop-hits on +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 + +# Disable this in testing to get deterministic results +TEST $CLI volume set $V0 performance.write-behind-trickling-writes off + +TEST $CLI volume start $V0 + +sleep 2; + +TEST glusterfs -s $H0 --volfile-id $V0 $M0 + +# Write a 100MB file with a window-size 1MB, we should get 100 writes of 1MB each +TEST dd if=/dev/zero of=$M0/100mb_file bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "1mb" 100 + +TEST $CLI volume set $V0 performance.write-behind-window-size 512KB + +# Write a 100MB file with a window-size 512KB, we should get 200 writes of 512KB each +TEST dd if=/dev/zero of=$M0/100mb_file_2 bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "512kb" 200 + +cleanup; diff --git a/tests/bugs/distribute/bug-1099890.t b/tests/bugs/distribute/bug-1099890.t index 40f70d4938b..29ceccf2309 100644 --- a/tests/bugs/distribute/bug-1099890.t +++ b/tests/bugs/distribute/bug-1099890.t @@ -44,6 +44,8 @@ TEST $CLI volume set $V0 features.quota-deem-statfs on TEST $CLI volume quota $V0 limit-usage / 150MB; +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1 + TEST $CLI volume set $V0 cluster.min-free-disk 50% TEST glusterfs -s $H0 --volfile-id=$V0 $M0 diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t index c5a7f041ac8..8cf905a8f0b 100755 --- a/tests/bugs/distribute/bug-1161311.t +++ b/tests/bugs/distribute/bug-1161311.t @@ -53,8 +53,14 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0; TEST mkdir $M0/dir1 TEST mkdir -p $M0/dir2/dir3 -# Create a large file (1GB), so that rebalance takes time -dd if=/dev/urandom 
of=$M0/dir1/FILE2 bs=64k count=10240 +# Create a large file (6.4 GB), so that rebalance takes time +# Reading from /dev/urandom is slow, so we'll cat it together +dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240 +for i in {1..10}; do + cat /tmp/FILE2 >> $M0/dir1/FILE2 +done + +#dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240 # Rename the file to create a linkto, for rebalance to # act on the file diff --git a/tests/bugs/fuse/bug-858488-min-free-disk.t b/tests/bugs/fuse/bug-858488-min-free-disk.t index 635dc04d1e6..ab636575d3f 100644 --- a/tests/bugs/fuse/bug-858488-min-free-disk.t +++ b/tests/bugs/fuse/bug-858488-min-free-disk.t @@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2 ## Lets create volume TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}; +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1 ## Verify volume is created EXPECT "$V0" volinfo_field $V0 'Volume Name'; diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t index 9fc7ac3b845..3bc80ab9dab 100644 --- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t +++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t @@ -1,6 +1,6 @@ #!/bin/bash -## Test case for cluster.min-free-disk option validation. +## Test case for cluster.cluster.min-free-disk option validation. . $(dirname $0)/../../include.rc @@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2 TEST $CLI volume start $V0 ## Setting invalid value for option cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk 143.!/12 -TEST ! $CLI volume set $V0 min-free-disk 123% -TEST ! $CLI volume set $V0 min-free-disk 194.34% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12 +TEST ! $CLI volume set $V0 cluster.min-free-disk 123% +TEST ! 
$CLI volume set $V0 cluster.min-free-disk 194.34% ## Setting fractional value as a size (unit is byte) for option ## cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk 199.051 -TEST ! $CLI volume set $V0 min-free-disk 111.999 +TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051 +TEST ! $CLI volume set $V0 cluster.min-free-disk 111.999 ## Setting valid value for option cluster.min-free-disk should pass -TEST $CLI volume set $V0 min-free-disk 12% -TEST $CLI volume set $V0 min-free-disk 56.7% -TEST $CLI volume set $V0 min-free-disk 120 -TEST $CLI volume set $V0 min-free-disk 369.0000 +TEST $CLI volume set $V0 cluster.min-free-disk 12% +TEST $CLI volume set $V0 cluster.min-free-disk 56.7% +TEST $CLI volume set $V0 cluster.min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 369.0000 cleanup; diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t index c30d2b852d4..1b9ca18c08a 100755 --- a/tests/bugs/glusterd/bug-859927.t +++ b/tests/bugs/glusterd/bug-859927.t @@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " " TEST $CLI volume set $V0 min-free-inodes 60% EXPECT "60%" volume_option $V0 cluster.min-free-inodes -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk " " -TEST $CLI volume set $V0 min-free-disk 60% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk " " +TEST $CLI volume set $V0 cluster.min-free-disk 60% EXPECT "60%" volume_option $V0 cluster.min-free-disk -TEST $CLI volume set $V0 min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 120 EXPECT "120" volume_option $V0 cluster.min-free-disk TEST ! 
$CLI volume set $V0 frame-timeout "" diff --git a/tests/cluster.rc b/tests/cluster.rc index 467bbcb06e1..42547f09e37 100644 --- a/tests/cluster.rc +++ b/tests/cluster.rc @@ -46,17 +46,18 @@ function define_glusterds() { bopt="management.transport.socket.bind-address=${!h}"; popt="--pid-file=${!b}/glusterd.pid"; sopt="management.glusterd-sockfile=${!b}/glusterd/gd.sock" + aopt="*.transport.address-family=inet" #Get the logdir logdir=`gluster --print-logdir` #Fetch the testcases name and prefix the glusterd log with it logfile=`echo ${0##*/}`_glusterd$i.log lopt="--log-file=$logdir/$logfile" if [ "$2" == "-LDEBUG" ]; then - eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; - eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; + eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; + eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; else - eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; - eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; + eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; + eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; fi done } diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6 new file mode 100644 index 00000000000..426b1ef5705 --- /dev/null +++ b/tests/configfiles/exports-v6 @@ -0,0 +1 @@ +/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) diff --git a/tests/env.rc.in b/tests/env.rc.in index 82971c4a8de..87befc3711d 
100644 --- a/tests/env.rc.in +++ b/tests/env.rc.in @@ -28,3 +28,6 @@ export PYTHON PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH export PYTHONPATH + +TESTER_CFLAGS="@TESTER_CFLAGS@" +export TESTER_CFLAGS diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t new file mode 100755 index 00000000000..4372998681f --- /dev/null +++ b/tests/features/brick-min-free-space.t @@ -0,0 +1,113 @@ +#!/bin/bash +# +# Test storage.min-free-disk option works. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd + +TEST truncate -s 16M $B0/brick0 +TEST LOOPDEV=$(losetup --find --show $B0/brick0) +TEST mkfs.xfs $LOOPDEV + +mkdir -p $B0/$V0 + +TEST mount -t xfs $LOOPDEV $B0/$V0 + +########### +# AIO on # +########### + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio on + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. 
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +############ +# AIO off # +############ + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +cleanup; diff --git a/tests/features/lock_revocation.t b/tests/features/lock_revocation.t new file mode 100644 index 00000000000..cbf21b71650 --- /dev/null +++ b/tests/features/lock_revocation.t @@ -0,0 +1,52 @@ +#!/bin/bash +logdir=$(gluster --print-logdir) +BRICK_LOGFILES="$logdir/bricks/d-backends-brick?.log" +rm -f $BRICK_LOGFILES &> /dev/null + +# Test that lock revocation works + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc +cleanup; + +function deadlock_fop() { + local MNT=$1 + for i in {1..1000}; do + dd if=/dev/zero of=$MNT/testfile bs=1k count=10 &> /dev/null + if grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null; then + break + fi + done +} + +function monkey_unlock() { + grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null && echo SUCCESS + return 0 +} + +function append_to_file() { + local FILE_PATH=$1 + echo "hello" >> $FILE_PATH + return 0 +} + +#Init +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} +TEST $CLI volume set $V0 self-heal-daemon off +TEST $CLI volume set $V0 features.locks-monkey-unlocking on +TEST $CLI volume set $V0 features.locks-revocation-secs 2 +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=$V0 -s $H0 $M0; +TEST $GFS --volfile-id=$V0 -s $H0 $M1; + +# Deadlock writes to a file using monkey unlocking +deadlock_fop $M0 & +EXPECT_WITHIN 60 "SUCCESS" monkey_unlock + +# Sleep > unlock timeout and attempt to write to the file +sleep 3 +TEST append_to_file $M1/testfile + +cleanup diff --git a/tests/halo.rc b/tests/halo.rc new file mode 100644 index 00000000000..4cb7c81da85 --- /dev/null +++ b/tests/halo.rc @@ -0,0 +1,52 @@ +# Return the current Halo state of a given child (by index, i.e. 0 +# is first child). +function halo_child_state { + grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG | + tail -n1 | sed 's/^.* halo state: //' | sed 's/ .*$//' +} + +# Return number of Halo children which are in a given state. +# First parameter is total # children. +# Second parameter is state to match (e.g. "UP"). 
+function halo_children_in_state { + local CHILD_COUNT=$1 + local SUM=0 + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + if [ x"$(halo_child_state $CHILD)" == x"$2" ]; then + SUM=$((SUM+1)) + fi + done + echo $SUM +} + +# Return number of up halo children, +# First parameter is total # children, +function halo_children_up { + echo $(halo_children_in_state $1 "UP") +} + +# Return number of down halo children, +# First parameter is total # children, +function halo_children_down { + echo $(halo_children_in_state $1 "DOWN") +} + +# Return number of up & down halo children. +# First parameter is total number of children. +function halo_sum_child_states { + local CHILD_COUNT=$1 + + local UP=0 + local DOWN=0 + + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + local STATE=$(halo_child_state $CHILD) + if [ x"$STATE" == x"UP" ]; then + UP=$((UP+1)) + elif [ x"$STATE" == x"DOWN" ]; then + DOWN=$((DOWN+1)) + fi + done + + echo "$UP $DOWN" +} diff --git a/tests/include.rc b/tests/include.rc index 492e35a7b6c..9f32e88f5f5 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -19,6 +19,8 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g CC=cc OSTYPE=$(uname -s) +M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point + ENV_RC=$(dirname $0)/../env.rc if [ ! 
-f $ENV_RC ]; then ENV_RC=$(dirname $0)/../../env.rc @@ -612,6 +614,7 @@ function build_tester () then cflags="$cflags $(pkg-config glusterfs-api --cflags-only-I --libs-only-L)" fi + cflags="$cflags ${TESTER_CFLAGS}" $CC -g -o $(dirname $cfile)/$execname $cfile $cflags } diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 903fbb39f12..bce94bb8b3b 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht ec +SUBDIRS = aha stripe afr dht ec CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f4d1d63c3d5..747577c9380 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -43,6 +43,13 @@ #include "afr-self-heald.h" #include "afr-messages.h" +#define CHILD_UP_STR "UP" +#define CHILD_DOWN_STR "DOWN" +#define CHILD_DISCONNECTED_STR "DOWN" + +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *fastest_children); + call_frame_t * afr_copy_frame (call_frame_t *base) { @@ -1371,21 +1378,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode) sizeof(gfid_copy)) % child_count; } +/* + * afr_halo_read_subvol + * + * Given a array representing the readable children, this function will + * return which one of the readable children meet the halo hybrid criteria. + * In the event none are found, -1 is returned and another strategy will have + * to be used to figure out where the read should come from. + */ +int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) { + afr_private_t *priv = NULL; + unsigned char *hybrid_children; + int32_t hybrid_cnt = 0; + int read_subvol = -1; + int i = 0; + + priv = this->private; + + /* Halo in-active or hybrid mode disabled, bail.... 
*/ + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return -1; + + /* AFR Discovery edge case, if you are already pinned to a child + * which meets the latency threshold then go with this child for + * consistency purposes. + */ + if (priv->read_child >= 0 && readable[priv->read_child] && + priv->child_latency[priv->read_child] <= + AFR_HALO_HYBRID_LATENCY_MSEC) { + return priv->read_child; + } + + hybrid_children = alloca0 (priv->child_count); + hybrid_cnt = find_hybrid_children (this, hybrid_children); + if (hybrid_cnt) { + for (i = 0; i < priv->child_count; i++) { + if (readable[i] && hybrid_children[i]) { + read_subvol = i; + priv->read_child = read_subvol; + gf_log (this->name, GF_LOG_TRACE, + "Selected hybrid child %d for reads", + i); + break; + } + } + } + + return read_subvol; +} + int afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, unsigned char *readable, afr_read_subvol_args_t *args) { - int i = 0; - int read_subvol = -1; - afr_private_t *priv = NULL; + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; afr_read_subvol_args_t local_args = {0,}; - priv = this->private; + priv = this->private; + + /* Choose lowest latency child for reads */ + read_subvol = afr_halo_read_subvol (this, readable); + if (read_subvol != -1) + return read_subvol; - /* first preference - explicitly specified or local subvolume */ - if (priv->read_child >= 0 && readable[priv->read_child]) + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) return priv->read_child; if (inode_is_linked (inode)) { @@ -1411,7 +1472,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, return -1; } - int afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, unsigned char *readable, int *event_p, @@ -2071,6 +2131,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); priv->read_child = child_index; + } 
else if (priv->halo_enabled) { + if (priv->read_child < 0) { + priv->read_child = child_index; + } else if (priv->child_latency[child_index] < + priv->child_latency[priv->read_child]) { + priv->read_child = child_index; + } } out: STACK_DESTROY(frame->root); @@ -2262,7 +2329,6 @@ unwind: return 0; } - int afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { @@ -2488,6 +2554,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) afr_local_t *local = NULL; afr_private_t *priv = NULL; int call_count = 0; + unsigned char *hybrid_children = NULL; local = frame->local; priv = this->private; @@ -2498,8 +2565,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) goto out; } - call_count = local->call_count = AFR_COUNT (local->child_up, - priv->child_count); + hybrid_children = alloca0 (priv->child_count); + call_count = find_hybrid_children (this, hybrid_children); + if (call_count) { + for (i = 0; i < priv->child_count; i++) + local->child_up[i] = hybrid_children[i]; + gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid " + "children for LOOKUPs", call_count); + } else { + hybrid_children = NULL; + call_count = AFR_COUNT (local->child_up, priv->child_count); + } + + local->call_count = call_count; ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, &local->loc); @@ -2732,6 +2810,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get (loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); + /* So this is the "secret" to why "Hybrid" halo works. Encoded in + * the cached inodes, we store what is effectively the "generational" + * state of the cluster along with a "packed" version of the extended + * attributes which determine which nodes are wise/fools. 
We can + * consult these cached values to figure out who we can trust, in the + * event the state of our cluster changes and we can no longer trust + * the cached info we "refresh" the inode (and hit all regions) to + * ensure we know which bricks we can safely read from. + */ if (event != local->event_generation) afr_inode_refresh (frame, this, loc->parent, NULL, afr_lookup_do); @@ -2956,7 +3043,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4204,25 +4291,569 @@ __get_heard_from_all_status (xlator_t *this) return heard_from_all; } +/* + * afr_cmp_child + * + * Passed to the qsort function to order a list of children by the latency + * and/or up/down states. + * + * Note: This isn't as simple as taking the latencies and calling it a + * a day. Children can be marked down, which overrides their latency + * signal. Having a lower-latency child available doesn't guarentee this + * child shall be marked up: we don't want to constantly be swapping + * slightly better bricks for others...this is jarring to clients and + * could cause all sorts of issues. Plus, the fail-over, max-replicas + * flags must all be honored which manage the up/down state of children. + * + * In short, the (as marked) up/down down state of the brick shall always + * take precedence when sorting by latency. 
+ */ +static int +_afr_cmp_child (const void *child1, const void *child2) +{ + struct afr_child *child11 = (struct afr_child *)child1; + struct afr_child *child22 = (struct afr_child *)child2; + + /* If both children are _marked_ down they are equal */ + if (!child11->child_up && !child22->child_up) + return 0; + + /* Prefer child 2, child 1 is _marked_ down, child 2 is not */ + if (!child11->child_up && child22->child_up) + return 1; + + /* Prefer child 1, child 2 is _marked_ down, child 1 is not */ + if (child11->child_up && !child22->child_up) + return -1; + + if (child11->latency > child22->latency) { + return 1; + } + if (child11->latency == child22->latency) { + return 0; + } + return -1; +} + +/* + * find_hybrid_children + * + * Given a char array representing our children (aka bricks within our AFR + * AFR "subvolume"), we'll mark this array with the children which are + * within the halo_hybrid_read_max_latency_sec or if none fit this condition, + * we'll pick the fastest two bricks. + * + * You might ask, why not just pick the quickest brick and be done with it? + * Well, being within our set is not suffcient to be chosen for the read, + * we must also be marked "readable", we still want to choose as many as + * we can within our local region to ensure we have somebody that is readable. + * + * To illustrate this, consider the case where a 1/2 bricks received a sync + * from some other writer, and the 2nd brick although faster wasn't present. + * In this case we'll want to use the slower brick to service the read. + * + * In short, this function just tells the caller which hybrid children, + * it gives no signal as to their readability, nor should it since this is + * handled later in the various flows (e.g. by afr_halo_read_subvol). 
+ */ +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *hybrid_children) +{ + int32_t i = 0; + afr_private_t *priv = NULL; + struct afr_child *sorted_list = NULL; + uint32_t max_latency; + uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT; + + priv = this->private; + + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return 0; + + if (limit > priv->child_count) + limit = priv->child_count; + + max_latency = priv->halo_hybrid_read_max_latency_msec; + + sorted_list = alloca (sizeof (struct afr_child) * priv->child_count); + + /* Find children meeting the latency threshold */ + for (i = 0; i < priv->child_count; i++) { + sorted_list[i].idx = i; + sorted_list[i].child_up = priv->child_up[i]; + sorted_list[i].latency = priv->child_latency[i]; + } + + /* QuickSort the children according to latency */ + qsort (sorted_list, priv->child_count, sizeof (struct afr_child), + _afr_cmp_child); + + i = 0; + while (i < priv->child_count && sorted_list[i].latency <= max_latency) + hybrid_children[sorted_list[i++].idx] = 1; + + /* Found some candidates */ + if (i != 0) + return i; + + /* If no candidates can be found meeting the max_latency threshold + * then find the best of those we have to our limit. 
+ */ + for (i = 0; i < limit; i++) + hybrid_children[sorted_list[i].idx] = 1; + + return i; +} + +int +find_best_down_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) " + "@ %ld ms latency", best_child, best_latency); + } + return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] >= worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)" + " @ %ld ms latency", worst_child, worst_latency); + } + return worst_child; +} + +static const char *halo_state_str(int i) +{ + switch (i) { + case 0: return "DOWN"; + case 1: return "UP"; + } + + return "unknown"; +} + + +static void dump_halo_states (xlator_t *this) { + afr_private_t *priv = NULL; + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (N/A)", + i, + halo_state_str(priv->child_up[i])); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (%"PRIi64" ms)", + i, + halo_state_str(priv->child_up[i]), + priv->child_latency[i]); + } + } +} + +static void +_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, + const int idx, const int64_t 
halo_max_latency_msec, + int32_t *event, int64_t *child_latency_msec, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int best_down_child = 0; + uint64_t latency_samples = 0; + + priv = this->private; + + /* Base it off the _minimum_ latency we've ever seen */ + *child_latency_msec = child_xlator->client_latency.min / 1000.0; + latency_samples = child_xlator->client_latency.count; + priv->child_latency[idx] = *child_latency_msec; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] == 1) { + up_children++; + } + } + + /* Don't do anything until you have some minimum numbner of + * latency samples */ + if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) { + gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient " + " number of latency samples (%" PRIu64 + " < %d), halo in-active.", + latency_samples, priv->halo_min_samples); + } + + gf_log (this->name, GF_LOG_DEBUG, + "ping: child %u (%s) latency %"PRIu64" ms (max %"PRIu64" ms)" + " up_count %d (min %d) enabled %s", + idx, child_xlator ? child_xlator->name : "<null>", + *child_latency_msec, + halo_max_latency_msec, + up_children, + priv->halo_min_replicas, + child_halo_enabled ? "true" : "false"); + + /* + * Case 1: This child's latency exceeds the maximum allowable + * for this halo. + */ + if (child_halo_enabled && + *child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && + up_children > priv->halo_min_replicas) { + if (find_worst_up_child (this) == idx) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "marking child down, " + "min_replicas (%d) still " + "satisfied.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_min_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + /* + * Case 2: Child latency is within halo and currently marked down, + * mark it up. 
+ */ + } else if ((child_halo_enabled == _gf_false || + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 0) { + if (child_halo_enabled == _gf_false || + up_children < priv->halo_max_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "below halo threshold (%ld) or halo is " + "disabled, marking child up.", + *child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_UP; + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", idx, + priv->halo_max_replicas); + } + /* + * Case 3: Child latency is within halo,and currently marked up, + * mark it down if it's the highest latency child and the + * number of up children is greater than halo_max_replicas. + * UNLESS you are an SHD in which case do nothing. + */ + } else if ((child_halo_enabled == _gf_true && + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 1) { + if (find_worst_up_child (this) == idx && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "but halo_max_replicas (%d) exceeded, " + "marking child down.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_max_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + } + + if (*event != GF_EVENT_CHILD_PING && + gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, + int32_t *event, int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int worst_up_child = -1; + gf_boolean_t was_down = _gf_false; + + priv = this->private; + + /* + * This only really counts if the child was never 
up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + /* + * Track the fact we did this, we may need to repeal this + * if we later decide to mark this brick down. + */ + was_down = _gf_true; + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (child_halo_enabled == _gf_true && + up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > + halo_max_latency_msec) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + priv->child_up[worst_up_child] = 0; + up_children--; + gf_log (this->name, GF_LOG_DEBUG, + "Marking child %d down, " + "doesn't meet halo threshold " + "(%ld), and > " + "halo_min_replicas (%d)", + worst_up_child, + halo_max_latency_msec, + priv->halo_min_replicas); + goto out; + } + } + if (priv->halo_enabled && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + worst_up_child = find_worst_up_child (this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + gf_log (this->name, GF_LOG_INFO, + "Marking child %d down, " + "up_children (%d) > " + "halo_max_replicas (%d)", + worst_up_child, + up_children, + priv->halo_max_replicas); + up_children--; + goto out; + } +out: + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + } else { + *event = 
GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, + int idx, int64_t child_latency_msec, + int64_t halo_max_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + gf_boolean_t swap_child = _gf_false; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to + * indicate the child is really disconnected. + */ + if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) { + priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY; + } + priv->child_up[idx] = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. 
+ */ + best_down_child = find_best_down_child (this); + if (child_halo_enabled == _gf_true) { + if (up_children < priv->halo_min_replicas && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + else if (up_children < priv->halo_max_replicas && + priv->child_latency[best_down_child] <= + halo_max_latency_msec && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + } + + if (swap_child) { + if (best_down_child >= 0) { + gf_log (this->name, GF_LOG_INFO, + "Swapping out child %d for " + "child %d to satisfy " + "halo_min_replicas (%d).", + idx, best_down_child, + priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going " + "offline until atleast one of them " + "comes back up."); + } else { + *event = GF_EVENT_CHILD_MODIFIED; + } + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +int64_t +_afr_get_halo_latency (xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = + priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %ld", + halo_max_latency_msec); + return halo_max_latency_msec; +} + + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) { + xlator_t *child_xlator = NULL; afr_private_t *priv = NULL; int i = -1; - int up_children = 0; - int down_children = 0; int propagate = 0; int had_heard_from_all = 0; int 
have_heard_from_all = 0; int idx = -1; int ret = -1; int call_psh = 0; + int up_child = -1; + uint64_t latency_samples = 0; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY; + gf_boolean_t child_halo_enabled = _gf_false; + child_xlator = (xlator_t *)data; priv = this->private; if (!priv) @@ -4235,7 +4866,7 @@ afr_notify (xlator_t *this, int32_t event, * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. */ priv->did_discovery = _gf_false; - + latency_samples = child_xlator->client_latency.count; /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has @@ -4246,7 +4877,7 @@ afr_notify (xlator_t *this, int32_t event, * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. */ - idx = find_child_index (this, data); + idx = find_child_index (this, child_xlator); if (idx < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -4255,6 +4886,28 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); + + if (!priv->halo_enabled || + latency_samples < priv->halo_min_samples) { + child_halo_enabled = _gf_false; + halo_max_latency_msec = INT64_MAX; + } else { + child_halo_enabled = _gf_true; + halo_max_latency_msec = _afr_get_halo_latency (this); + } + + if (event == GF_EVENT_CHILD_PING) { + /* Calculates the child latency and sets event + */ + LOCK (&priv->lock); + { + _afr_handle_ping_event (this, child_xlator, idx, + halo_max_latency_msec, &event, + &child_latency_msec, child_halo_enabled); + } + UNLOCK (&priv->lock); + } + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { @@ -4281,52 +4934,16 @@ afr_notify (xlator_t *this, int32_t event, 
propagate = 1; break; case GF_EVENT_CHILD_UP: - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_children = __afr_get_up_children_count (priv); - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - } else { - event = GF_EVENT_CHILD_MODIFIED; - } - - priv->last_event[idx] = event; - + _afr_handle_child_up_event (this, child_xlator, + idx, halo_max_latency_msec, &event, &call_psh, + &up_child, child_halo_enabled); break; case GF_EVENT_CHILD_DOWN: - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_ALL_SUBVOLS_DOWN, - "All subvolumes are down. 
Going offline " - "until atleast one of them comes back up."); - } else { - event = GF_EVENT_SOME_CHILD_DOWN; - } - - priv->last_event[idx] = event; - + _afr_handle_child_down_event (this, child_xlator, idx, + child_latency_msec, halo_max_latency_msec, + &event, &call_psh, &up_child, + child_halo_enabled); break; case GF_EVENT_CHILD_CONNECTING: @@ -4353,7 +4970,6 @@ afr_notify (xlator_t *this, int32_t event, had come up, propagate CHILD_UP, but only this time */ event = GF_EVENT_CHILD_DOWN; - up_children = __afr_get_up_children_count (priv); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7f7962013d7..c7d6261b110 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -46,7 +46,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_spbc_timeout_t, gf_afr_mt_spb_status_t, gf_afr_mt_empty_brick_t, - gf_afr_mt_end + gf_afr_mt_child_latency_t, + gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 4becfb835e8..87542799a5b 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -371,7 +371,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; off_t off = 0; - size_t block = 128 * 1024; + size_t block = 0; int type = AFR_SELFHEAL_DATA_FULL; int ret = -1; call_frame_t *iter_frame = NULL; @@ -383,6 +383,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, healed_sinks[ARBITER_BRICK_INDEX] = 0; } + block = 128 * 1024 * priv->data_self_heal_window_size; + type = afr_data_self_heal_type_get (priv, healed_sinks, source, replies); diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c6ac5ebfd1b..4ac1d32f58a 100644 --- 
a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -58,6 +58,7 @@ typedef struct { eh_t **statistics; uint32_t max_threads; uint32_t wait_qlength; + uint32_t halo_max_latency_msec; } afr_self_heald_t; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 6f4783c9213..ae9b28c7fb4 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -176,6 +176,42 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); + GF_OPTION_RECONF ("halo-enabled", + priv->halo_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-failover-enabled", + priv->halo_failover_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-shd-max-latency", + priv->shd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec, + options, uint32, out); + + GF_OPTION_RECONF ("halo-hybrid-mode", + priv->halo_hybrid_mode, options, bool, + out); + + GF_OPTION_RECONF ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, options, + uint32, out); + + GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options, + uint32, out); + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -396,6 +432,35 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + GF_OPTION_INIT ("halo-hybrid-mode", + priv->halo_hybrid_mode, bool, out); + + GF_OPTION_INIT ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, uint32, + 
out); + + GF_OPTION_INIT ("halo-enabled", + priv->halo_enabled, bool, out); + + GF_OPTION_INIT ("halo-failover-enabled", + priv->halo_failover_enabled, bool, out); + + GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32, + out); + + GF_OPTION_INIT ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, uint32, out); + + GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -445,17 +510,24 @@ init (xlator_t *this) priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up) { + + priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency), + child_count, + gf_afr_mt_child_latency_t); + + if (!priv->child_up || !priv->child_latency) { ret = -ENOMEM; goto out; } - for (i = 0; i < child_count; i++) + for (i = 0; i < child_count; i++) { + priv->child_latency[i] = 0.0; priv->child_up[i] = -1; /* start with unknown state. this initialization needed for afr_notify() to work reliably */ + } priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, gf_afr_mt_xlator_t); @@ -663,6 +735,85 @@ struct volume_options options[] = { "jobs that can perform parallel heals in the " "background." }, + { .key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "Maximum latency for shd halo replication in msec." + }, + { .key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable Halo (geo) replication mode." 
+ }, + { .key = {"halo-failover-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable x-halo failover: will allow failover " + "to bricks outside the client or daemons' halo " + "in an attempt to satisfy halo-min-replicas." + }, + { .key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for nfsd halo replication in msec." + }, + { .key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for halo replication in msec." + }, + { .key = {"halo-hybrid-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable hybrid sync mounts. When enabled, halo will " + "do write FOPs synchronously, and read FOPs will be " + "services in-region if the inode is clean/consistent." + "If no bricks can be found below " + "halo-hybrid-max-read-latency then the best 2 shall " + "be selected. This option can be used in " + "conjunction with all other halo options." + }, + { .key = {"halo-hybrid-read-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "8", + .description = "Maximum latency hybrid mode will use to select " + "children for read FOPs. Don't tune this unless " + "you really know what you are doing (i.e. you've " + "read/understand the associated source code)." + }, + { .key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD." + }, + { .key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .description = "The minimum number of halo replicas, before adding " + "out of region replicas." 
+ }, + { .key = {"halo-min-samples"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "3", + .description = "The minimum number of halo latency samples, before " + "we start forming the halos." + }, { .key = {"heal-wait-queue-length"}, .type = GF_OPTION_TYPE_INT, .min = 0, @@ -802,6 +953,13 @@ struct volume_options options[] = { "translator is running as part of self-heal-daemon " "or not." }, + { .key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not." + }, { .key = {"quorum-type"}, .type = GF_OPTION_TYPE_STR, .value = { "none", "auto", "fixed"}, @@ -866,7 +1024,7 @@ struct volume_options options[] = { }, { .key = {"heal-timeout"}, .type = GF_OPTION_TYPE_INT, - .min = 60, + .min = 5, .max = INT_MAX, .default_value = "600", .description = "time interval for checking the need to self-heal " diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4bffc30788a..d09aa6852c8 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -30,6 +30,9 @@ #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) +#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* Latency for down children */ +#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Examine bricks <= 10 msec */ +#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */ #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ @@ -60,6 +63,17 @@ typedef enum { AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; +struct afr_nfsd { + gf_boolean_t iamnfsd; + uint32_t halo_max_latency_msec; +}; + +struct afr_child { + uint32_t idx; + int64_t latency; + unsigned char child_up; +}; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned 
int child_count; /* total number of children */ @@ -71,6 +85,7 @@ typedef struct _afr_private { inode_t *root_inode; unsigned char *child_up; + int64_t *child_latency; unsigned char *local; char **pending_key; @@ -141,8 +156,19 @@ typedef struct _afr_private { gf_boolean_t ensure_durability; char *sh_domain; char *afr_dirty; - - afr_self_heald_t shd; + gf_boolean_t halo_enabled; + + /* Halo geo-replication tunables */ + gf_boolean_t halo_failover_enabled; + gf_boolean_t halo_hybrid_mode; + uint32_t halo_hybrid_read_max_latency_msec; + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; + uint32_t halo_min_samples; + + afr_self_heald_t shd; + struct afr_nfsd nfsd; gf_boolean_t consistent_metadata; uint64_t spb_choice_timeout; diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/cluster/aha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am new file mode 100644 index 00000000000..006db127d28 --- /dev/null +++ b/xlators/cluster/aha/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = aha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +aha_la_LDFLAGS = -module -avoid-version + +aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c +aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = aha-mem-types.h aha.h aha-helpers.h aha.h aha-retry.h aha-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c new file mode 100644 index 00000000000..3b2ca641de2 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.c @@ -0,0 +1,952 @@ +#include "aha-fops.h" + +static 
void +__save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + list_add_tail (&fop->list, &conf->failed); +} + +void +save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __save_fop (fop, conf); + } + UNLOCK (&conf->lock); +} + +#define AHA_HANDLE_FOP(frame, type, cbk, obj, fn, args ...) \ + do { \ + struct aha_fop *fop = aha_fop_new (); \ + if (!fop) { \ + gf_log (GF_AHA, GF_LOG_CRITICAL, \ + "Allocation failed, terminating " \ + "to prevent a hung mount."); \ + assert (0); \ + } \ + fop->stub = fop_##type##_stub (frame, aha_##type, \ + args); \ + fop->frame = frame; \ + frame->local = fop; \ + STACK_WIND (frame, cbk, obj, fn, args); \ + } while (0) \ + +/* + * AHA_HANDLE_FOP_CBK + * + * 1) If the error returned is ENOTCONN *and* the timer that waits + * for the server to come back has not expired, store the fop to retry later. + * 2) If the timer waiting for the server has expired, just unwind. + * 3) If the error returned is something other than ENOTCONN, just unwind. + * + */ +#define AHA_HANDLE_FOP_CBK(type, frame, args ...) 
\ + do { \ + struct aha_conf *conf = frame->this->private; \ + struct aha_fop *fop = frame->local; \ + if (op_ret != 0 && op_errno == ENOTCONN && \ + !aha_is_timer_expired (conf)) { \ + gf_log (GF_AHA, GF_LOG_WARNING, \ + "Got ENOTCONN from client, storing " \ + "to retry later!"); \ + save_fop (fop, conf); \ + } else { \ + AHA_DESTROY_LOCAL (frame); \ + STACK_UNWIND_STRICT (type, frame, args); \ + } \ + } while (0) \ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode, + buf, xdata, postparent); + return 0; +} + + +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + loc, xdata); + return 0; +} + + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, stat, aha_stat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, + loc, xdata); + return 0; +} + + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + + +int +aha_fsetattr_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} + + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t mask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, access, aha_access_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->access, + 
loc, mask, xdata); + return 0; +} + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno, + path, sbuf, xdata); + return 0; +} + + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + size_t size, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readlink, + loc, size, xdata); + return 0; +} + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; +} + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, 
op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->unlink, + loc, xflag, xdata); + return 0; +} + + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rmdir, + loc, flags, xdata); + return 0; +} + + +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->symlink, + linkpath, loc, umask, xdata); + return 0; +} + + +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf, + preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + return 0; +} + + +int +aha_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, 
rename, aha_rename_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, link, aha_link_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, create, aha_create_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + + +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, open, aha_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt 
*stbuf, struct iobref *iobref, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno, + vector, count, stbuf, iobref, xdata); + return 0; +} + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readv, aha_readv_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, + off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, writev, aha_writev_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + return 0; +} + + +int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, flush, aha_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, + fd, xdata); + return 0; +} + + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, flags, xdata); + return 0; 
+} + + +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, + fd, xdata); + return 0; +} + + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, opendir, aha_opendir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->opendir, + loc, fd, xdata); + return 0; +} + +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsyncdir, + fd, flags, xdata); + return 0; +} + + +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->statfs, + loc, xdata); + return 0; +} + + + +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + 
AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + + +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->getxattr, + loc, name, xdata); + return 0; +} + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + 
AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->xattrop, + loc, flags, dict, xdata); + return 0; +} + + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fxattrop, + fd, flags, dict, xdata); + return 0; +} + + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock 
*lock, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata); + return 0; +} + + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lk, aha_lk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lk, + fd, cmd, lock, xdata); + return 0; +} + + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->inodelk, + volume, loc, cmd, lock, xdata); + return 0; +} + + +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->finodelk, + volume, fd, cmd, lock, xdata); + return 0; +} + + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + return 0; +} + + +int +aha_fentrylk_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + return 0; +} + +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdir, + fd, size, off, xdata); + return 0; +} + + +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdirp, + fd, size, off, dict); + return 0; +} diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h new file mode 100644 index 00000000000..b1fb9d38a80 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.h @@ -0,0 +1,360 @@ +#ifndef _AHA_FOPS_H +#define _AHA_FOPS_H + +#include "aha.h" +#include "aha-helpers.h" + +/* FOP functions */ +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t 
*loc, dict_t *xdata); + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata); + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata); + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); + +int +aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata); + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int 
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata); + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata); + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, + dict_t *xdata); + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd 
cmd, entrylk_type type, dict_t *xdata); + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +/* Callback functions */ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent); + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata); + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + + 
+int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata); +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata); + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata); +int 
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata); +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata); +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_lk_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata); + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata); +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata); + +#endif /* _AHA_FOPS_H */ diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c new file mode 100644 index 00000000000..e3b713688d3 --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.c @@ -0,0 +1,53 @@ +#include "aha-helpers.h" + +struct aha_conf *aha_conf_new () +{ + struct aha_conf *conf = NULL; + + conf = GF_CALLOC (1, sizeof (*conf), gf_aha_mt_conf); + if (!conf) + goto err; + + INIT_LIST_HEAD (&conf->failed); + + LOCK_INIT (&conf->lock); +err: + return conf; +} + +/* + * Destroy conf's lock and free conf. A NULL conf is ignored, mirroring + * the NULL check in aha_fop_destroy(). + */ +void aha_conf_destroy (struct aha_conf *conf) +{ + if (!conf) + return; + + LOCK_DESTROY (&conf->lock); + GF_FREE (conf); +} + +struct aha_fop *aha_fop_new () +{ + struct aha_fop *fop = NULL; + + fop = GF_CALLOC (1, sizeof (*fop), gf_aha_mt_fop); + if (!fop) + goto err; + + INIT_LIST_HEAD (&fop->list); + +err: + return fop; +} + +void aha_fop_destroy (struct aha_fop *fop) +{ + if (!fop) + return; + + call_stub_destroy (fop->stub); + fop->stub = NULL; + GF_FREE (fop); +} diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h new file mode 100644 index 
00000000000..d9cf9b3295d --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.h @@ -0,0 +1,28 @@ +#ifndef _AHA_HELPERS_H +#define _AHA_HELPERS_H + +#include "aha.h" + +#define GF_AHA "aha" + +struct aha_conf *aha_conf_new (); + +void aha_conf_destroy (struct aha_conf *conf); + +struct aha_fop *aha_fop_new (); + +void aha_fop_destroy (struct aha_fop *fop); + +/* + * Destroy the struct aha_fop stored in frame->local and clear the pointer. + * The temporary has a macro-private name so that an argument expression + * mentioning `fop` is not captured by it; `frame` is evaluated twice. + */ +#define AHA_DESTROY_LOCAL(frame) \ + do { \ + struct aha_fop *aha_local_fop_ = (frame)->local; \ + aha_fop_destroy (aha_local_fop_); \ + (frame)->local = NULL; \ + } while (0) + +#endif /* _AHA_HELPERS_H */ diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h new file mode 100644 index 00000000000..117dda27e8b --- /dev/null +++ b/xlators/cluster/aha/src/aha-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __AHA_MEM_TYPES_H__ +#define __AHA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_aha_mem_types_ { + gf_aha_mt_begin_t = gf_common_mt_end + 1, + gf_aha_mt_conf, + gf_aha_mt_fop, + gf_aha_mt_end +}; +#endif diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c new file mode 100644 index 00000000000..8810f913f42 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.c @@ -0,0 +1,524 @@ +#include "aha.h" +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" + +/* + * AHA_RETRY_FOP: + * + * - We STACK_WIND the fop using the arguments in the call_stub. + * We use STACK_WIND because we need a *new* frame, since we already + * exhausted the existing frame with the original STACK_WIND. 
+ * + * - After STACK_WIND completes, we can destroy this frame's local (which + * should be struct aha_fop *). The frame itself will get destroyed higher in + * the xlator graph, since it is still part of the call stack. + */ +#define AHA_RETRY_FOP(fop, type, args ...) \ + do { \ + call_stub_t *stub = fop->stub; \ + call_frame_t *frame = fop->frame; \ + xlator_t *this = frame->this; \ + STACK_WIND (frame, aha_##type##_cbk, this, \ + this->fops->type, args); \ + AHA_DESTROY_LOCAL (frame); \ + } while (0) \ + +#define AHA_UNWIND_FOP(fop, type) \ + do { \ + call_frame_t *frame = fop->frame; \ + AHA_DESTROY_LOCAL (frame); \ + default_##type##_failure_cbk (frame, ETIMEDOUT); \ + } while (0) \ + +void +__aha_retry_force_unwind_fops (struct aha_conf *conf) +{ + struct aha_fop *fop = NULL; + struct aha_fop *tmp = NULL; + size_t ndrained = 0; + + /* + * Drain the queue. After we finish the loop, the list + * must be empty. + */ + list_for_each_entry_safe (fop, tmp, &conf->failed, list) { + list_del (&fop->list); + aha_force_unwind_fop (fop); + ndrained++; + } + + gf_log (GF_AHA, GF_LOG_WARNING, + "Force-unwound %"GF_PRI_SIZET" fops!", ndrained); + + assert (list_empty (&conf->failed)); +} + +void +aha_force_unwind_fops (struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __aha_retry_force_unwind_fops (conf); + } + UNLOCK (&conf->lock); +} + +void +__aha_retry_failed_fops (struct aha_conf *conf) +{ + struct aha_fop *fop = NULL; + struct aha_fop *tmp = NULL; + size_t ndrained = 0; + + /* + * Skip if the child is not up + */ + if (!conf->child_up) { + gf_log (GF_AHA, GF_LOG_WARNING, + "Waiting for child to come up before retrying."); + return; + } + + /* Skip if the queue is empty; returning here also avoids the + * misleading "Drained 0 fops!" warning further down. */ + if (list_empty (&conf->failed)) { + gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry."); + return; + } + + /* + * Drain the queue. After we finish the loop, the list + * must be empty. 
+ */ + list_for_each_entry_safe (fop, tmp, &conf->failed, list) { + list_del (&fop->list); + aha_retry_fop (fop); + ndrained++; + } + + gf_log (GF_AHA, GF_LOG_WARNING, + "Drained %"GF_PRI_SIZET" fops!", ndrained); + + assert (list_empty (&conf->failed)); +} + + +void +aha_retry_failed_fops (struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __aha_retry_failed_fops (conf); + } + UNLOCK (&conf->lock); +} + +void aha_retry_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_CREATE: + AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags, + stub->args.mode, stub->args.umask, + stub->args.fd, + stub->args.xdata); + break; + + case GF_FOP_STAT: + AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READLINK: + AHA_RETRY_FOP (fop, readlink, &stub->args.loc, + stub->args.size, stub->args.xdata); + break; + + case GF_FOP_MKNOD: + AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode, + stub->args.rdev, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_MKDIR: + AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode, + stub->args.umask, stub->args.xdata); + break; + + case GF_FOP_UNLINK: + AHA_RETRY_FOP (fop, unlink, &stub->args.loc, stub->args.xflag, + stub->args.xdata); + break; + + case GF_FOP_RMDIR: + AHA_RETRY_FOP (fop, rmdir, &stub->args.loc, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_SYMLINK: + AHA_RETRY_FOP (fop, symlink, stub->args.linkname, + &stub->args.loc, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_RENAME: + AHA_RETRY_FOP (fop, rename, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_LINK: + AHA_RETRY_FOP (fop, link, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_TRUNCATE: + AHA_RETRY_FOP (fop, truncate, 
&stub->args.loc, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READ: + AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_WRITE: + AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector, + stub->args.count, stub->args.offset, + stub->args.flags, stub->args.iobref, + stub->args.xdata); + break; + + case GF_FOP_STATFS: + AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_FLUSH: + AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNC: + AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync, + stub->args.xdata); + break; + + case GF_FOP_SETXATTR: + AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_GETXATTR: + AHA_RETRY_FOP (fop, getxattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FSETXATTR: + AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd, + stub->args.xattr, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_FGETXATTR: + AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_REMOVEXATTR: + AHA_RETRY_FOP (fop, removexattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_OPENDIR: + AHA_RETRY_FOP (fop, opendir, &stub->args.loc, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNCDIR: + AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd, + stub->args.datasync, stub->args.xdata); + break; + + case GF_FOP_ACCESS: + AHA_RETRY_FOP (fop, access, &stub->args.loc, + stub->args.mask, stub->args.xdata); + break; + + case GF_FOP_FTRUNCATE: + AHA_RETRY_FOP (fop, ftruncate, stub->args.fd, + stub->args.offset, stub->args.xdata); + break; + + 
case GF_FOP_FSTAT: + AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_LK: + AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_INODELK: + AHA_RETRY_FOP (fop, inodelk, stub->args.volume, + &stub->args.loc, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_FINODELK: + AHA_RETRY_FOP (fop, finodelk, stub->args.volume, + stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_ENTRYLK: + AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_FENTRYLK: + AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_LOOKUP: + AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READDIR: + AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READDIRP: + AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_XATTROP: + AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_FXATTROP: + AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_SETATTR: + AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + case GF_FOP_FSETATTR: + AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + default: + /* Some fops are not implemented yet: + * + * GF_FOP_NULL + * GF_FOP_RCHECKSUM + * GF_FOP_FORGET + * GF_FOP_RELEASE + * GF_FOP_RELEASEDIR + * 
GF_FOP_GETSPEC + * GF_FOP_FALLOCATE + * GF_FOP_DISCARD + * GF_FOP_ZEROFILL + * GF_FOP_MAXVALUE + * + */ + gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s", + gf_fop_list[stub->fop]); + assert (0); + break; + } +} + +void +aha_force_unwind_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_UNWIND_FOP (fop, open); + break; + + case GF_FOP_CREATE: + AHA_UNWIND_FOP (fop, create); + break; + + case GF_FOP_STAT: + AHA_UNWIND_FOP (fop, stat); + break; + + case GF_FOP_READLINK: + AHA_UNWIND_FOP (fop, readlink); + break; + + case GF_FOP_MKNOD: + AHA_UNWIND_FOP (fop, mknod); + break; + + case GF_FOP_MKDIR: + AHA_UNWIND_FOP (fop, mkdir); + break; + + case GF_FOP_UNLINK: + AHA_UNWIND_FOP (fop, unlink); + break; + + case GF_FOP_RMDIR: + AHA_UNWIND_FOP (fop, rmdir); + break; + + case GF_FOP_SYMLINK: + AHA_UNWIND_FOP (fop, symlink); + break; + + case GF_FOP_RENAME: + AHA_UNWIND_FOP (fop, rename); + break; + + case GF_FOP_LINK: + AHA_UNWIND_FOP (fop, link); + break; + + case GF_FOP_TRUNCATE: + AHA_UNWIND_FOP (fop, truncate); + break; + + case GF_FOP_READ: + AHA_UNWIND_FOP (fop, readv); + break; + + case GF_FOP_WRITE: + AHA_UNWIND_FOP (fop, writev); + break; + + case GF_FOP_STATFS: + AHA_UNWIND_FOP (fop, statfs); + break; + + case GF_FOP_FLUSH: + AHA_UNWIND_FOP (fop, flush); + break; + + case GF_FOP_FSYNC: + AHA_UNWIND_FOP (fop, fsync); + break; + + case GF_FOP_SETXATTR: + AHA_UNWIND_FOP (fop, setxattr); + break; + + case GF_FOP_GETXATTR: + AHA_UNWIND_FOP (fop, getxattr); + break; + + case GF_FOP_FSETXATTR: + AHA_UNWIND_FOP (fop, fsetxattr); + break; + + case GF_FOP_FGETXATTR: + AHA_UNWIND_FOP (fop, fgetxattr); + break; + + case GF_FOP_REMOVEXATTR: + AHA_UNWIND_FOP (fop, removexattr); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_UNWIND_FOP (fop, fremovexattr); + break; + + case GF_FOP_OPENDIR: + AHA_UNWIND_FOP (fop, opendir); + break; + + case GF_FOP_FSYNCDIR: + AHA_UNWIND_FOP (fop, fsyncdir); + break; + + case 
GF_FOP_ACCESS: + AHA_UNWIND_FOP (fop, access); + break; + + case GF_FOP_FTRUNCATE: + AHA_UNWIND_FOP (fop, ftruncate); + break; + + case GF_FOP_FSTAT: + AHA_UNWIND_FOP (fop, fstat); + break; + + case GF_FOP_LK: + AHA_UNWIND_FOP (fop, lk); + break; + + case GF_FOP_INODELK: + AHA_UNWIND_FOP (fop, inodelk); + break; + + case GF_FOP_FINODELK: + AHA_UNWIND_FOP (fop, finodelk); + break; + + case GF_FOP_ENTRYLK: + AHA_UNWIND_FOP (fop, entrylk); + break; + + case GF_FOP_FENTRYLK: + AHA_UNWIND_FOP (fop, fentrylk); + break; + + case GF_FOP_LOOKUP: + AHA_UNWIND_FOP (fop, lookup); + break; + + case GF_FOP_READDIR: + AHA_UNWIND_FOP (fop, readdir); + break; + + case GF_FOP_READDIRP: + AHA_UNWIND_FOP (fop, readdirp); + break; + + case GF_FOP_XATTROP: + AHA_UNWIND_FOP (fop, xattrop); + break; + + case GF_FOP_FXATTROP: + AHA_UNWIND_FOP (fop, fxattrop); + break; + + case GF_FOP_SETATTR: + AHA_UNWIND_FOP (fop, setattr); + break; + + case GF_FOP_FSETATTR: + AHA_UNWIND_FOP (fop, fsetattr); + break; + + default: + /* Some fops are not implemented yet, + * and this would never happen cause we wouldn't + * queue them (see the assert statement in aha_retry_fop()) + */ + break; + } +} diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h new file mode 100644 index 00000000000..5c8f56bca97 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.h @@ -0,0 +1,12 @@ +#ifndef _AHA_RETRY_H +#define _AHA_RETRY_H + +void aha_retry_failed_fops (struct aha_conf *conf); + +void aha_retry_fop (struct aha_fop *fop); + +void aha_force_unwind_fops (struct aha_conf *conf); + +void aha_force_unwind_fop (struct aha_fop *fop); + +#endif /* _AHA_RETRY_H */ diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c new file mode 100644 index 00000000000..2135e47f37f --- /dev/null +++ b/xlators/cluster/aha/src/aha.c @@ -0,0 +1,345 @@ +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" +#include "aha.h" + +#include "syncop.h" + + +int 
+retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg) +{ + /* Nothing to do here ... */ + return 0; +} + +int +retry_failed_fops (void *arg) +{ + xlator_t *this = NULL; + + struct aha_conf *conf = NULL; + + this = arg; + conf = this->private; + + aha_retry_failed_fops (conf); + + return 0; +} + +void +dispatch_fop_queue_drain (xlator_t *this) +{ + struct syncenv *env = NULL; + int ret = 0; + + env = this->ctx->env; + + ret = synctask_new (env, retry_failed_fops, + retry_failed_fops_cbk, NULL, this); + if (ret != 0) { + gf_log (GF_AHA, GF_LOG_CRITICAL, + "Failed to dispatch synctask " + "to drain fop queue!"); + } +} + +inline void +__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired) +{ + conf->timer_expired = expired; +} + +inline gf_boolean_t +__aha_is_timer_expired (struct aha_conf *conf) +{ + return conf->timer_expired; +} + +gf_boolean_t +aha_is_timer_expired (struct aha_conf *conf) +{ + gf_boolean_t expired = _gf_false; + + LOCK (&conf->lock); + { + expired = __aha_is_timer_expired (conf); + } + UNLOCK (&conf->lock); + + return expired; +} + +void +aha_child_down_timer_expired (void *data) +{ + struct aha_conf *conf = NULL; + + conf = data; + + gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!"); + + LOCK (&conf->lock); + { + __aha_set_timer_status (conf, _gf_true); + } + UNLOCK (&conf->lock); + + aha_force_unwind_fops ((struct aha_conf *)data); +} + +void +__aha_start_timer (struct aha_conf *conf) +{ + struct timespec child_down_timeout = { + .tv_sec = conf->server_wait_timeout, + .tv_nsec = 0 + }; + + __aha_set_timer_status (conf, _gf_false); + + conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout, + aha_child_down_timer_expired, conf); + if (!conf->timer) { + gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!"); + } + + gf_log (GF_AHA, GF_LOG_INFO, + "Registered timer for %lu seconds.", + conf->server_wait_timeout); +} + +void +__aha_cancel_timer (struct aha_conf *conf) +{ + if (!conf->timer) + goto out; + 
+ gf_timer_call_cancel (conf->this->ctx, conf->timer); + conf->timer = NULL; + gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!"); +out: + return; +} + +void +__aha_update_child_status (struct aha_conf *conf, int status) +{ + conf->child_up = status; +} + +void +aha_handle_child_up (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status ( + conf, AHA_CHILD_STATUS_UP); /* Mark the child as up */ + __aha_set_timer_status ( + conf, _gf_false); /* Timer is no longer expired */ + __aha_cancel_timer (conf); /* Cancel the timer */ + } + UNLOCK (&conf->lock); +} + +void +aha_handle_child_down (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status (conf, AHA_CHILD_STATUS_DOWN); + __aha_set_timer_status (conf, _gf_true); + __aha_start_timer (conf); + } + UNLOCK (&conf->lock); +} + +int32_t +notify (xlator_t *this, int32_t event, void *data, ...) +{ + switch (event) { + case GF_EVENT_CHILD_DOWN: + gf_log (this->name, GF_LOG_WARNING, "Got child-down event!"); + aha_handle_child_down (this); + break; + case GF_EVENT_CHILD_UP: + gf_log (this->name, GF_LOG_WARNING, "Got child-up event!"); + aha_handle_child_up (this); + dispatch_fop_queue_drain (this); + break; + default: + break; + } + + default_notify (this, event, data); + + return 0; +} + +int32_t +aha_priv_dump (xlator_t *this) +{ + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_aha_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init failed!"); + return ret; + } + + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF ("server-wait-timeout-seconds", + conf->server_wait_timeout, + options, size_uint64, err); + + return 0; +err: + return -1; +} + +int +aha_init_options 
(xlator_t *this) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_INIT ("server-wait-timeout-seconds", + conf->server_wait_timeout, + size_uint64, err); + + return 0; +err: + return -1; +} + + +int +init (xlator_t *this) +{ + int ret = 0; + struct aha_conf *conf = NULL; + + conf = aha_conf_new (); + if (!conf) { + ret = -(ENOMEM); + goto err; + } + + conf->this = this; + this->private = conf; + + aha_init_options (this); + + /* init() completed successfully */ + goto done; +err: + gf_log (GF_AHA, GF_LOG_ERROR, + "init() failed, please see " + "logs for details."); + + /* Free all allocated memory */ + aha_conf_destroy (conf); +done: + return ret; +} + +void +fini (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + aha_conf_destroy (conf); + + this->private = NULL; +} + +struct xlator_dumpops dumpops = { + .priv = aha_priv_dump, +}; + +struct xlator_fops cbks; + +struct xlator_fops fops = { + .lookup = aha_lookup, + .stat = aha_stat, + .readlink = aha_readlink, + .mknod = aha_mknod, + .mkdir = aha_mkdir, + .unlink = aha_unlink, + .rmdir = aha_rmdir, + .symlink = aha_symlink, + .rename = aha_rename, + .link = aha_link, + .truncate = aha_truncate, + .create = aha_create, + .open = aha_open, + .readv = aha_readv, + .writev = aha_writev, + .statfs = aha_statfs, + .flush = aha_flush, + .fsync = aha_fsync, + .setxattr = aha_setxattr, + .getxattr = aha_getxattr, + .removexattr = aha_removexattr, + .fsetxattr = aha_fsetxattr, + .fgetxattr = aha_fgetxattr, + .fremovexattr = aha_fremovexattr, + .opendir = aha_opendir, + .readdir = aha_readdir, + .readdirp = aha_readdirp, + .fsyncdir = aha_fsyncdir, + .access = aha_access, + .ftruncate = aha_ftruncate, + .fstat = aha_fstat, + .lk = aha_lk, + .lookup_cbk = aha_lookup_cbk, + .xattrop = aha_xattrop, + .fxattrop = aha_fxattrop, + .inodelk = aha_inodelk, + .finodelk = aha_finodelk, + .entrylk = aha_entrylk, + .fentrylk = aha_fentrylk, + .setattr = aha_setattr, + .fsetattr = aha_fsetattr, +}; + 
+struct volume_options options[] = { + { .key = {"server-wait-timeout-seconds"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 10, + .max = 20 * 60, + .default_value = TOSTRING (120), + .description = "Specifies the number of seconds the " + "AHA translator will wait " + "for a CHILD_UP event before " + "force-unwinding the frames it has " + "currently stored for retry." + }, + { .key = {NULL} } +}; diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h new file mode 100644 index 00000000000..3dbf3199776 --- /dev/null +++ b/xlators/cluster/aha/src/aha.h @@ -0,0 +1,46 @@ +#ifndef _AHA_H +#define _AHA_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "statedump.h" +#include "call-stub.h" +#include "defaults.h" +#include "list.h" +#include "timer.h" + +#include "aha-mem-types.h" + +/* new() and destroy() functions for all structs can be found in + * aha-helpers.c + */ +struct aha_conf { + xlator_t *this; + uint8_t child_up; + gf_lock_t lock; + struct list_head failed; + gf_timer_t *timer; + gf_boolean_t timer_expired; + uint64_t server_wait_timeout; +}; + +struct aha_fop { + call_stub_t *stub; /* Only used to store function arguments */ + call_frame_t *frame; /* Frame corresponding to this fop */ + uint64_t tries; + struct list_head list; +}; + +enum { + AHA_CHILD_STATUS_DOWN = 0, + AHA_CHILD_STATUS_UP = 1, + AHA_CHILD_STATUS_MAX +}; + +gf_boolean_t aha_is_timer_expired (struct aha_conf *conf); + +#endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index a9714b02b79..a97d03bb055 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -5559,6 +5559,7 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -5571,9 +5572,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, subvol, 
subvol->fops->mknod, loc, mode, rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + /* This will return NULL if all subvolumes are full + * and/or no subvolume needs the min_free_disk limit + */ + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->rdev = rdev; local->mode = mode; @@ -5603,6 +5610,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + return op_errno; } int32_t @@ -6242,8 +6251,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this, } } - dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode, - umask, params); + op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, + rdev, mode, umask, + params); + if (op_errno != 0) { + goto err; + } done: return 0; @@ -6738,6 +6751,7 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -6752,8 +6766,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } else { avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; local->mode = mode; @@ -6780,6 +6796,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return op_errno; } int @@ -6882,9 +6902,10 @@ dht_create_do (call_frame_t *frame) goto err; } - dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc, - local->flags, local->mode, - local->umask, local->fd, 
local->params); + dht_create_wind_to_avail_subvol (frame, this, subvol, + &local->loc, local->flags, + local->mode, local->umask, + local->fd, local->params); return 0; err: local->refresh_layout_unlock (frame, this, -1, 1); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9e9ca712417..613a9d39816 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -300,6 +300,7 @@ struct dht_du { uint64_t avail_space; uint32_t log; uint32_t chunks; + gf_boolean_t is_full; }; typedef struct dht_du dht_du_t; @@ -484,6 +485,7 @@ struct dht_conf { dht_du_t *du_stats; double min_free_disk; double min_free_inodes; + gf_boolean_t min_free_strict_mode; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -549,6 +551,10 @@ struct dht_conf { gf_boolean_t lock_migration_enabled; gf_lock_t lock; + + /* du stats */ + uint32_t du_refresh_interval_sec; + gf_lock_t du_refresh_lock; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 1eb9e63c531..1b20dabc61f 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; struct timeval tv = {0,}; + struct timeval cmp_tv = {0,}; loc_t tmp_loc = {0,}; conf = this->private; + /* Somebody else is already refreshing the statfs info */ + if (TRY_LOCK (&conf->du_refresh_lock) != 0) + return 0; + gettimeofday (&tv, NULL); + cmp_tv = conf->last_stat_fetch; + cmp_tv.tv_sec += conf->du_refresh_interval_sec; + /* make it root gfid, should be enough to get the proper info back */ tmp_loc.gfid[15] = 1; - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - + if (timercmp (&tv, &cmp_tv, >)) { statfs_frame = copy_frame (frame); if (!statfs_frame) { 
goto err; @@ -200,14 +206,18 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) &tmp_loc, statfs_local->params); } - conf->last_stat_fetch.tv_sec = tv.tv_sec; + conf->last_stat_fetch = tv; } - return 0; + ret = 0; + goto out; err: if (statfs_frame) DHT_STACK_DESTROY (statfs_frame); - return -1; + ret = -1; +out: + UNLOCK (&conf->du_refresh_lock); + return ret; } @@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) conf = this->private; /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { + if (TRY_LOCK (&conf->subvolume_lock) != 0) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + return conf->du_stats[i].is_full; + } + } + } else { for (i = 0; i < conf->subvolume_cnt; i++) { if (subvol == conf->subvolumes[i]) { if (conf->disk_unit == 'p') { @@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } } - } + + /* i will be less than subvolume_cnt if either of + * these booleans are true */ + is_subvol_filled = ( + subvol_filled_space || subvol_filled_inodes); + if (is_subvol_filled) { + conf->du_stats[i].is_full = is_subvol_filled; + } + } UNLOCK (&conf->subvolume_lock); if (subvol_filled_space && conf->subvolume_status[i]) { @@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } - is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); - return is_subvol_filled; } @@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol, layout); - if(!avail_subvol) - { - avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol, - layout); - } - } UNLOCK (&conf->subvolume_lock); out: @@ -325,7 +339,6 @@ out: gf_msg_debug (this->name, 0, "No subvolume has enough free space \ and/or inodes to create"); - 
avail_subvol = subvol; } if (layout) diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 549f1b9ea7e..e320109c796 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + if (conf->min_free_strict_mode == _gf_true) + dht_get_du_info (frame, this, loc); local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); if (!local) { @@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if (conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true && + flags & O_APPEND) { + op_errno = ENOSPC; + goto err; } local->rebalance.flags = flags; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 112685b659e..7420461da76 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + loc_t *nil_loc = {0,}; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); if (!local) { @@ -173,15 +178,21 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } + if (conf->min_free_strict_mode == _gf_true) + dht_get_du_info (frame, this, nil_loc); + subvol = local->cached_subvol; if (!subvol) { gf_msg_debug (this->name, 0, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if 
(conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true) { + op_errno = ENOSPC; + goto err; } - local->rebalance.vector = iov_dup (vector, count); local->rebalance.offset = off; local->rebalance.count = count; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 1d145855ed7..10fd878041e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -20,7 +20,7 @@ #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ -#define DHT_REBALANCE_BLKSIZE (128 * 1024) +#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */ #define MAX_MIGRATE_QUEUE_COUNT 500 #define MIN_MIGRATE_QUEUE_COUNT 200 diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 5c810f0dc77..ccbf66b626d 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -443,6 +443,8 @@ dht_reconfigure (xlator_t *this, dict_t *options) conf->disk_unit = 0; if (conf->min_free_disk < 100.0) conf->disk_unit = 'p'; + GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode, + options, bool, out); GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, percent, out); @@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options, bool, out); + + GF_OPTION_RECONF ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, options, uint32, out); ret = 0; out: return ret; @@ -720,7 +725,10 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + err); + + GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode, + bool, err); GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); @@ -738,6 +746,11 @@ dht_init (xlator_t *this) GF_OPTION_INIT 
("lock-migration", conf->lock_migration_enabled, bool, err); + GF_OPTION_INIT ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, uint32, err); + + LOCK_INIT (&conf->du_refresh_lock); + if (defrag) { defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -907,6 +920,14 @@ struct volume_options options[] = { "process starts balancing out the cluster, and logs will appear " "in log files", }, + { .key = {"min-free-strict-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "When enabled, will reject in-flight writes or " + "append operations to files when the target subvolume falls " + "below min-free-(disk|inodes). When disabled, these are allowed " + "through and only new files will be affected.", + }, { .key = {"min-free-inodes"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "5%", @@ -1089,5 +1110,14 @@ struct volume_options options[] = { " associated with a file during rebalance" }, + { .key = {"du-refresh-interval-sec"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "60", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies how many seconds before subvolume statfs " + "info is re-validated." 
+ }, + { .key = {NULL} }, }; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 56e17d6e884..996faffa37f 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->params = dict_ref (params); local->mode = mode; @@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index f1e9a399442..8b14ac99b8f 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->mode = mode; local->flags = flags; @@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index c21417a0192..69f182c5194 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -91,9 +91,13 @@ typedef struct _ios_sample_t { uid_t uid; gid_t gid; char identifier[UNIX_PATH_MAX]; + char path[UNIX_PATH_MAX]; glusterfs_fop_t fop_type; struct timeval timestamp; 
double elapsed; + gf_boolean_t have_path; + int32_t op_ret; + int32_t op_errno; } ios_sample_t; @@ -178,10 +182,33 @@ typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*, int , int , uint64_t ) ; struct ios_local { - struct timeval wind_at; - struct timeval unwind_at; + inode_t *inode; + loc_t loc; + fd_t *fd; }; +static struct ios_local * +ios_local_new() { + return GF_CALLOC (1, sizeof (struct ios_local), + gf_common_mt_char); +} + +static void +ios_local_free (struct ios_local *local) +{ + if (!local) + return; + + inode_unref (local->inode); + + if (local->fd) + fd_unref (local->fd); + + loc_wipe (&local->loc); + memset (local, 0, sizeof (*local)); + GF_FREE (local); +} + struct volume_options options[]; static int @@ -192,6 +219,57 @@ is_fop_latency_started (call_frame_t *frame) return memcmp (&frame->begin, &epoch, sizeof (epoch)); } +static void +ios_free_local (call_frame_t *frame) +{ + struct ios_local *local = frame->local; + + ios_local_free (local); + + frame->local = NULL; +} + +static void +ios_track_loc (call_frame_t *frame, loc_t *loc) +{ + struct ios_local *local = NULL; + + if (loc && loc->path) { + /* Check if frame->local is already set (it should + * only be set by either ios_track_loc() or + * ios_track_fd()). In other words, this check + * allows us to chain calls to ios_track_loc() + * and ios_track_fd() without clobbering frame->local + * in the process. 
+ */ + if (frame->local) { + local = frame->local; + } else { + local = ios_local_new (); + } + loc_copy (&local->loc, loc); + frame->local = local; + } +} + +static void +ios_track_fd (call_frame_t *frame, fd_t *fd) +{ + struct ios_local *local = NULL; + + if (fd && fd->inode) { + if (frame->local) { + local = frame->local; + } else { + local = ios_local_new (); + } + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + frame->local = local; + } +} + + #define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples" #ifdef GF_LINUX_HOST_OS #define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats" @@ -206,7 +284,7 @@ is_fop_latency_started (call_frame_t *frame) conf = this->private; \ if (conf && conf->measure_latency) { \ gettimeofday (&frame->end, NULL); \ - update_ios_latency (conf, frame, GF_FOP_##op); \ + update_ios_latency (conf, frame, GF_FOP_##op, 0, 0); \ } \ } while (0) @@ -244,7 +322,7 @@ is_fop_latency_started (call_frame_t *frame) #define STATS_ADD(x,i) (x) += (i) #endif -#define UPDATE_PROFILE_STATS(frame, op) \ +#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \ do { \ struct ios_conf *conf = NULL; \ \ @@ -257,7 +335,8 @@ is_fop_latency_started (call_frame_t *frame) conf->count_fop_hits) { \ BUMP_FOP(op); \ gettimeofday (&frame->end, NULL); \ - update_ios_latency (conf, frame, GF_FOP_##op);\ + update_ios_latency (conf, frame, GF_FOP_##op, \ + op_ret, op_errno); \ } \ } \ STATS_UNLOCK (&conf->lock); \ @@ -694,7 +773,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this, int _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { - char *key_root = "gluster"; + char *key_root = "storage.gluster"; char *xlator_name = NULL; char *instance_name = NULL; size_t key_len = 0; @@ -719,7 +798,7 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { } if (strcmp (__progname, "glusterfsd") == 0) - key_root = "gluster.brick"; + key_root = "storage.gluster.brick"; if (instance_name) { /* +3 for 2 x "." 
+ NULL */ @@ -1010,7 +1089,10 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, char *port_pos = NULL; char *group_name = NULL; char *username = NULL; + char *path = NULL; struct ios_conf *conf = NULL; + const char *error_string = NULL; + int32_t op_errno = 0; conf = this->private; @@ -1057,12 +1139,22 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, sprintf (group_name, "%d", (int32_t)sample->gid); } + path = "Unknown"; + if (sample->have_path) + path = sample->path; + + error_string = "No Error"; + if (sample->op_ret != 0) { + op_errno = abs (sample->op_errno); + error_string = strerror (op_errno); + } + ios_log (this, logfp, - "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s", + "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s", epoch_time, fop_enum_to_pri_string (sample->fop_type), fop_enum_to_string (sample->fop_type), sample->elapsed, xlator_name, instance_name, username, - group_name, hostname, port); + group_name, hostname, port, path, op_errno, error_string); goto out; err: gf_log (this->name, GF_LOG_ERROR, @@ -1608,14 +1700,87 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd) return 0; } +void ios_local_get_inode (struct ios_local *local, inode_t **inode) +{ + if (!local) + return; + + /* In the cases that a loc is given to us, + * we should use that as the source of truth + * for the inode. + */ + if (local->loc.inode) { + *inode = local->loc.inode; + return; + } + + /* Fall back to the inode in the local struct, + * but there is no guarantee this will be a valid + * pointer. + */ + *inode = local->inode; +} + +void ios_local_get_path (call_frame_t *frame, const char **path) +{ + struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; + inode_t *inode = NULL; + + local = frame->local; + if (!local) + goto out; + + ios_local_get_inode (local, &inode); + + if (inode) { + /* Each inode should have an iosstat struct attached to it. + * This is the preferred way to retrieve the path. 
+ */ + ios_inode_ctx_get (inode, frame->this, &iosstat); + if (iosstat) { + gf_log ("io-stats", GF_LOG_DEBUG, + "[%s] Getting path from iostat struct", + fop_enum_to_string (frame->op)); + *path = iosstat->filename; + goto out; + } + } + + /* If we don't have the iosstat attached to the inode, + * fall back to retrieving the path via the loc struct + * inside the local. + */ + if (local->loc.path) { + gf_log ("io-stats", GF_LOG_DEBUG, + "[%s] Getting path from loc_t", + fop_enum_to_string (frame->op)); + *path = local->loc.path; + goto out; + } + +out: + /* If the inode and the loc don't have the path, we're out of luck. + */ + if (!*path) { + gf_log ("io-stats", GF_LOG_DEBUG, + "Unable to get path for fop: %s", + fop_enum_to_string (frame->op)); + } + + return; +} + void collect_ios_latency_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, double elapsed, - call_frame_t *frame) + call_frame_t *frame, int32_t op_ret, int32_t op_errno) { + struct ios_local *ios_local = NULL; ios_sample_buf_t *ios_sample_buf = NULL; ios_sample_t *ios_sample = NULL; struct timeval *timestamp = NULL; call_stack_t *root = NULL; + const char *path = NULL; ios_sample_buf = conf->ios_sample_buf; @@ -1630,6 +1795,8 @@ void collect_ios_latency_sample (struct ios_conf *conf, ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]); ios_sample->elapsed = elapsed; ios_sample->fop_type = fop_type; + ios_sample->op_ret = op_ret; + ios_sample->op_errno = op_errno; ios_sample->uid = root->uid; ios_sample->gid = root->gid; (ios_sample->timestamp).tv_sec = timestamp->tv_sec; @@ -1637,6 +1804,52 @@ void collect_ios_latency_sample (struct ios_conf *conf, memcpy (&ios_sample->identifier, &root->identifier, sizeof (root->identifier)); + /* Eventually every FOP will be supported + * (i.e., the frame->local will be + * of type struct ios_local), but for now, this is a safety. 
+ */ + switch (ios_sample->fop_type) { + + case GF_FOP_CREATE: + case GF_FOP_OPEN: + case GF_FOP_STAT: + case GF_FOP_FSTAT: + case GF_FOP_READ: + case GF_FOP_WRITE: + case GF_FOP_OPENDIR: + case GF_FOP_READDIRP: + case GF_FOP_READDIR: + case GF_FOP_FLUSH: + case GF_FOP_ACCESS: + case GF_FOP_UNLINK: + case GF_FOP_TRUNCATE: + case GF_FOP_MKDIR: + case GF_FOP_RMDIR: + case GF_FOP_SETATTR: + case GF_FOP_LOOKUP: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FXATTROP: + case GF_FOP_XATTROP: + case GF_FOP_GETXATTR: + case GF_FOP_FGETXATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_STATFS: + case GF_FOP_FSYNC: + ios_local_get_path (frame, &path); + break; + default: + path = NULL; + break; + } + + if (path) { + strncpy (ios_sample->path, path, sizeof (ios_sample->path)); + ios_sample->have_path = _gf_true; + } + /* We've reached the end of the circular buffer, start from the * beginning. */ if (ios_sample_buf->pos == (ios_sample_buf->size - 1)) @@ -1674,7 +1887,7 @@ update_ios_latency_stats (struct ios_global_stats *stats, double elapsed, int update_ios_latency (struct ios_conf *conf, call_frame_t *frame, - glusterfs_fop_t op) + glusterfs_fop_t op, int32_t op_ret, int32_t op_errno) { double elapsed; struct timeval *begin, *end; @@ -1687,7 +1900,7 @@ update_ios_latency (struct ios_conf *conf, call_frame_t *frame, update_ios_latency_stats (&conf->cumulative, elapsed, op); update_ios_latency_stats (&conf->incremental, elapsed, op); - collect_ios_latency_sample (conf, op, elapsed, frame); + collect_ios_latency_sample (conf, op, elapsed, frame, op_ret, op_errno); return 0; } @@ -1811,40 +2024,100 @@ unlock_list_head: return ret; } +static int +attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, + const uuid_t gfid) { + struct ios_stat *iosstat = NULL; + + if (!inode) { + return -EINVAL; + } + + ios_inode_ctx_get (inode, this, &iosstat); + if (!iosstat) { + iosstat = GF_CALLOC (1, sizeof 
(*iosstat), + gf_io_stats_mt_ios_stat); + if (!iosstat) { + return -ENOMEM; + } + iosstat->filename = gf_strdup (path); + gf_uuid_copy (iosstat->gfid, gfid); + LOCK_INIT (&iosstat->lock); + ios_inode_ctx_set (inode, this, iosstat); + } + + return 0; +} + + +int +ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd) +{ + struct ios_fd *ifd = NULL; + int ret = 0; + + ifd = GF_CALLOC (1, sizeof (*ifd), gf_io_stats_mt_ios_fd); + if (!ifd) { + ret = -ENOMEM; + goto free_and_out; + } + + if (path) { + ifd->filename = gf_strdup (path); + if (!ifd->filename) { + ret = -ENOMEM; + goto free_and_out; + } + } + + gettimeofday (&ifd->opened_at, NULL); + + if (fd) + ios_fd_ctx_set (fd, this, ifd); + + *iosfd = ifd; + + return ret; + + /* Failure path */ +free_and_out: + if (ifd) { + GF_FREE (ifd->filename); + GF_FREE (ifd); + } + + *iosfd = NULL; + + return ret; +} + + int io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - struct ios_fd *iosfd = NULL; - char *path = NULL; - struct ios_stat *iosstat = NULL; - struct ios_conf *conf = NULL; - - conf = this->private; + struct ios_local *local = NULL; + struct ios_conf *conf = NULL; + struct ios_fd *iosfd = NULL; - path = frame->local; - frame->local = NULL; - - if (!path) + if (op_ret < 0) { goto unwind; + } - if (op_ret < 0) { - GF_FREE (path); + local = frame->local; + if (!local) { goto unwind; } - iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); + conf = this->private; + + ios_build_fd (this, local->loc.path, fd, &iosfd); if (!iosfd) { - GF_FREE (path); goto unwind; } - iosfd->filename = path; - gettimeofday (&iosfd->opened_at, NULL); - - ios_fd_ctx_set (fd, this, iosfd); LOCK (&conf->lock); { conf->cumulative.nr_opens++; @@ -1855,18 +2128,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK 
(&conf->lock); - iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); - if (!iosstat) { - GF_FREE (path); - goto unwind; - } - iosstat->filename = gf_strdup (path); - gf_uuid_copy (iosstat->gfid, buf->ia_gfid); - LOCK_INIT (&iosstat->lock); - ios_inode_ctx_set (fd->inode, this, iosstat); + attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, + buf->ia_gfid); unwind: - UPDATE_PROFILE_STATS (frame, CREATE); + UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, preparent, postparent, xdata); return 0; @@ -1877,44 +2144,24 @@ int io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - struct ios_fd *iosfd = NULL; - char *path = NULL; - struct ios_stat *iosstat = NULL; - struct ios_conf *conf = NULL; - - conf = this->private; - path = frame->local; - frame->local = NULL; - - if (!path) - goto unwind; + struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; + struct ios_conf *conf = NULL; + struct ios_fd *iosfd = NULL; if (op_ret < 0) { - GF_FREE (path); goto unwind; } - iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); - if (!iosfd) { - GF_FREE (path); + local = frame->local; + if (!local) { goto unwind; } - iosfd->filename = path; - gettimeofday (&iosfd->opened_at, NULL); - - ios_fd_ctx_set (fd, this, iosfd); - - ios_inode_ctx_get (fd->inode, this, &iosstat); - if (!iosstat) { - iosstat = GF_CALLOC (1, sizeof (*iosstat), - gf_io_stats_mt_ios_stat); - if (iosstat) { - iosstat->filename = gf_strdup (path); - gf_uuid_copy (iosstat->gfid, fd->inode->gfid); - LOCK_INIT (&iosstat->lock); - ios_inode_ctx_set (fd->inode, this, iosstat); - } + conf = this->private; + ios_build_fd (this, local->loc.path, fd, &iosfd); + if (!iosfd) { + goto unwind; } LOCK (&conf->lock); @@ -1926,13 +2173,19 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, } } UNLOCK (&conf->lock); + + ios_inode_ctx_get (fd->inode, this, &iosstat); if (iosstat) { BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN); - iosstat = NULL; } -unwind: - UPDATE_PROFILE_STATS (frame, OPEN); + attach_iosstat_to_inode (this, local->loc.inode, + local->loc.path, + local->loc.inode->gfid); + +unwind: + UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); return 0; @@ -1943,7 +2196,8 @@ int io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, STAT); + UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -1956,26 +2210,29 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iobref *iobref, dict_t *xdata) { int len = 0; - fd_t *fd = NULL; struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; - fd = frame->local; - frame->local = NULL; + local = frame->local; + if (!local || !local->fd) + goto unwind; if (op_ret > 0) { len = iov_length (vector, count); - BUMP_READ (fd, len); + BUMP_READ (local->fd, len); } - UPDATE_PROFILE_STATS (frame, READ); - ios_inode_ctx_get (fd->inode, this, &iosstat); + UPDATE_PROFILE_STATS (frame, READ, op_ret, op_errno); + ios_inode_ctx_get (local->fd->inode, this, &iosstat); if (iosstat) { - BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); - BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); - iosstat = NULL; + BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); + BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); + } +unwind: + ios_free_local (frame); STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, buf, iobref, xdata); return 0; @@ -1989,21 +2246,23 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prebuf, struct 
iatt *postbuf, dict_t *xdata) { struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; inode_t *inode = NULL; - UPDATE_PROFILE_STATS (frame, WRITE); - if (frame->local){ - inode = frame->local; - frame->local = NULL; - ios_inode_ctx_get (inode, this, &iosstat); - if (iosstat) { - BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); - BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); - inode = NULL; - iosstat = NULL; - } - } + local = frame->local; + if (!local || !local->fd) + goto unwind; + + UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno); + + ios_inode_ctx_get (local->inode, this, &iosstat); + if (iosstat) { + BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); + BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); + } +unwind: + ios_free_local (frame); STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2021,7 +2280,7 @@ io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, frame->local = NULL; - UPDATE_PROFILE_STATS (frame, READDIRP); + UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno); ios_inode_ctx_get (inode, this, &iosstat); @@ -2039,7 +2298,16 @@ int io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, READDIR); + struct ios_local *local = NULL; + struct ios_stat *iosstat = NULL; + + local = frame->local; + + UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); + + ios_free_local (frame); + + UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2050,8 +2318,10 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSYNC); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + UPDATE_PROFILE_STATS (frame, FSYNC, 
op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); return 0; } @@ -2061,7 +2331,8 @@ io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SETATTR); + UPDATE_PROFILE_STATS (frame, SETATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, xdata); return 0; } @@ -2072,7 +2343,8 @@ io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, UNLINK); + UPDATE_PROFILE_STATS (frame, UNLINK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, postparent, xdata); return 0; @@ -2086,7 +2358,7 @@ io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, RENAME); + UPDATE_PROFILE_STATS (frame, RENAME, op_ret, op_errno); STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent, postoldparent, prenewparent, postnewparent, xdata); @@ -2099,7 +2371,8 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *sbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, READLINK); + UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata); return 0; } @@ -2111,7 +2384,14 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - UPDATE_PROFILE_STATS (frame, 
LOOKUP); + struct ios_local *local = frame->local; + + if (local && local->loc.path && inode && op_ret >= 0) { + attach_iosstat_to_inode (this, inode, local->loc.path, + inode->gfid); + } + UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata, postparent); return 0; @@ -2124,7 +2404,7 @@ io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SYMLINK); + UPDATE_PROFILE_STATS (frame, SYMLINK, op_ret, op_errno); STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2137,7 +2417,7 @@ io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, MKNOD); + UPDATE_PROFILE_STATS (frame, MKNOD, op_ret, op_errno); STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2151,28 +2431,16 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - struct ios_stat *iosstat = NULL; - char *path = frame->local; + struct ios_local *local = frame->local; - if (!path) - goto unwind; - - UPDATE_PROFILE_STATS (frame, MKDIR); - if (op_ret < 0) - goto unwind; - - iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); - if (iosstat) { - LOCK_INIT (&iosstat->lock); - iosstat->filename = gf_strdup(path); - gf_uuid_copy (iosstat->gfid, buf->ia_gfid); - ios_inode_ctx_set (inode, this, iosstat); + if (local && local->loc.path) { + local->inode = inode_ref (inode); + attach_iosstat_to_inode (this, inode, local->loc.path, + buf->ia_gfid); } -unwind: - /* local is assigned with path */ - GF_FREE 
(frame->local); - frame->local = NULL; + UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2185,7 +2453,7 @@ io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, LINK); + UPDATE_PROFILE_STATS (frame, LINK, op_ret, op_errno); STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2196,7 +2464,8 @@ int io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FLUSH); + UPDATE_PROFILE_STATS (frame, FLUSH, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); return 0; } @@ -2206,20 +2475,28 @@ int io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - struct ios_stat *iosstat = NULL; - int ret = -1; + struct ios_local *local = NULL; + struct ios_stat *iosstat = NULL; + int ret = -1; + + local = frame->local; + if (!local || !local->fd) + goto unwind; - UPDATE_PROFILE_STATS (frame, OPENDIR); if (op_ret < 0) goto unwind; - ios_fd_ctx_set (fd, this, 0); + attach_iosstat_to_inode (this, local->inode, local->loc.path, + local->inode->gfid); - ret = ios_inode_ctx_get (fd->inode, this, &iosstat); - if (!ret) + ios_fd_ctx_set (local->fd, this, 0); + ios_inode_ctx_get (local->fd->inode, this, &iosstat); + if (iosstat) BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR); unwind: + UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); return 0; } @@ -2231,8 +2508,8 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, RMDIR); - + UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, postparent, xdata); return 0; @@ -2244,7 +2521,8 @@ io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, TRUNCATE); + UPDATE_PROFILE_STATS (frame, TRUNCATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2255,7 +2533,8 @@ int io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, STATFS); + UPDATE_PROFILE_STATS (frame, STATFS, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2265,7 +2544,8 @@ int io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SETXATTR); + UPDATE_PROFILE_STATS (frame, SETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2275,7 +2555,8 @@ int io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, GETXATTR); + UPDATE_PROFILE_STATS (frame, GETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2285,7 +2566,8 @@ int io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, 
REMOVEXATTR); + UPDATE_PROFILE_STATS (frame, REMOVEXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2294,7 +2576,8 @@ int io_stats_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSETXATTR); + UPDATE_PROFILE_STATS (frame, FSETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2304,7 +2587,8 @@ int io_stats_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FGETXATTR); + UPDATE_PROFILE_STATS (frame, FGETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2314,7 +2598,8 @@ int io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FREMOVEXATTR); + UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2324,7 +2609,8 @@ int io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSYNCDIR); + UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); return 0; } @@ -2334,7 +2620,20 @@ int io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, ACCESS); + struct ios_local *local = frame->local; + + /* ACCESS is called before a READ when a fop fails over + * in NFS. 
We need to make sure that we are attaching the + * data correctly to this inode. + */ + if (local->loc.inode && local->loc.path) { + attach_iosstat_to_inode (this, local->loc.inode, + local->loc.path, + local->loc.inode->gfid); + } + + UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); return 0; } @@ -2345,7 +2644,8 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FTRUNCATE); + UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2356,7 +2656,8 @@ int io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSTAT); + UPDATE_PROFILE_STATS (frame, FSTAT, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2367,8 +2668,9 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, FALLOCATE); - STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -2379,8 +2681,9 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, DISCARD); - STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + UPDATE_PROFILE_STATS (frame, 
DISCARD, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -2390,7 +2693,8 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, ZEROFILL); + UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2400,7 +2704,8 @@ int io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, LK); + UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata); return 0; } @@ -2410,7 +2715,8 @@ int io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, ENTRYLK); + UPDATE_PROFILE_STATS (frame, ENTRYLK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata); return 0; } @@ -2420,7 +2726,8 @@ int io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, XATTROP); + UPDATE_PROFILE_STATS (frame, XATTROP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2430,7 +2737,8 @@ int io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FXATTROP); + UPDATE_PROFILE_STATS (frame, FXATTROP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata); 
return 0; } @@ -2440,7 +2748,8 @@ int io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, INODELK); + UPDATE_PROFILE_STATS (frame, INODELK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata); return 0; } @@ -2450,6 +2759,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { + ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_entrylk_cbk, @@ -2464,6 +2775,7 @@ int io_stats_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); @@ -2479,8 +2791,8 @@ int io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - - UPDATE_PROFILE_STATS (frame, FINODELK); + UPDATE_PROFILE_STATS (frame, FINODELK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata); return 0; } @@ -2490,6 +2802,7 @@ int io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_finodelk_cbk, @@ -2504,6 +2817,7 @@ int io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_xattrop_cbk, @@ -2518,6 +2832,7 @@ int io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fxattrop_cbk, @@ -2532,6 
+2847,7 @@ int io_stats_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_lookup_cbk, @@ -2545,6 +2861,7 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this, int io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_stat_cbk, @@ -2559,6 +2876,7 @@ int io_stats_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_readlink_cbk, @@ -2573,6 +2891,7 @@ int io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_mknod_cbk, @@ -2587,9 +2906,7 @@ int io_stats_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { - if (loc->path) - frame->local = gf_strdup (loc->path); - + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_mkdir_cbk, @@ -2604,6 +2921,7 @@ int io_stats_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_unlink_cbk, @@ -2618,6 +2936,7 @@ int io_stats_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_rmdir_cbk, @@ -2674,6 +2993,7 @@ int io_stats_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_setattr_cbk, @@ -2688,6 +3008,7 @@ int io_stats_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t 
*xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_truncate_cbk, @@ -2702,8 +3023,8 @@ int io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - if (loc->path) - frame->local = gf_strdup (loc->path); + ios_track_loc (frame, loc); + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); @@ -2719,9 +3040,10 @@ int io_stats_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) + { - if (loc->path) - frame->local = gf_strdup (loc->path); + ios_track_loc (frame, loc); + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); @@ -2737,8 +3059,7 @@ int io_stats_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - frame->local = fd; - + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_readv_cbk, @@ -2756,9 +3077,12 @@ io_stats_writev (call_frame_t *frame, xlator_t *this, uint32_t flags, struct iobref *iobref, dict_t *xdata) { int len = 0; + struct ios_conf *conf = NULL; + struct ios_local *local = NULL; + int ret = 0; + + ios_track_fd (frame, fd); - if (fd->inode) - frame->local = fd->inode; len = iov_length (vector, count); BUMP_WRITE (fd, len); @@ -2777,6 +3101,7 @@ int io_stats_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_statfs_cbk, @@ -2791,6 +3116,7 @@ int io_stats_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_flush_cbk, @@ -2805,6 +3131,7 @@ int io_stats_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fsync_cbk, @@ -2971,7 +3298,7 @@ _ios_dump_thread (xlator_t *this) { 
stats_filename, strerror(errno)); log_stats_fopen_failure = _gf_false; } - samples_logfp = fopen (samples_filename, "w+"); + samples_logfp = fopen (samples_filename, "a"); if (samples_logfp) { io_stats_dump_latency_samples_logfp (this, samples_logfp); @@ -3024,6 +3351,8 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this, goto out; } + ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_setxattr_cbk, @@ -3042,6 +3371,7 @@ int io_stats_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_getxattr_cbk, @@ -3056,6 +3386,7 @@ int io_stats_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_removexattr_cbk, @@ -3071,6 +3402,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fsetxattr_cbk, @@ -3085,6 +3417,7 @@ int io_stats_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fgetxattr_cbk, @@ -3099,6 +3432,7 @@ int io_stats_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fremovexattr_cbk, @@ -3170,6 +3504,7 @@ int io_stats_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_access_cbk, @@ -3212,6 +3547,7 @@ int io_stats_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, 
io_stats_fstat_cbk, diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c index 270632bc71b..2eb3a9f9149 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc, case RPC_CLNT_DISCONNECT: case RPC_CLNT_MSG: case RPC_CLNT_DESTROY: + case RPC_CLNT_PING: break; } diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c index 77637c7beec..459d173db7f 100644 --- a/xlators/features/changelog/src/changelog-ev-handle.c +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -180,6 +180,8 @@ changelog_rpc_notify (struct rpc_clnt *rpc, /* Free up mydata */ changelog_rpc_clnt_unref (crpc); break; + case RPC_CLNT_PING: + break; } return 0; diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 640c6bb5553..d7c210f24a5 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -234,6 +234,7 @@ blkd: continue; bcount++; + list_del_init (&ilock->client_list); list_del_init (&ilock->blocked_locks); list_add (&ilock->blocked_locks, &released); } @@ -268,6 +269,7 @@ granted: continue; gcount++; + list_del_init (&ilock->client_list); list_del_init (&ilock->list); list_add (&ilock->list, &released); } @@ -321,6 +323,7 @@ blkd: bcount++; + list_del_init (&elock->client_list); list_del_init (&elock->blocked_locks); list_add_tail (&elock->blocked_locks, &released); } @@ -355,6 +358,7 @@ granted: } gcount++; + list_del_init (&elock->client_list); list_del_init (&elock->domain_list); list_add_tail (&elock->domain_list, &removed); diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index d56a7aca2be..c40c29de63a 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -1116,3 
+1116,16 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) return conf; } + +gf_boolean_t +pl_does_monkey_want_stuck_lock() +{ + long int monkey_unlock_rand = 0; + long int monkey_unlock_rand_rem = 0; + + monkey_unlock_rand = random (); + monkey_unlock_rand_rem = monkey_unlock_rand % 100; + if (monkey_unlock_rand_rem == 0) + return _gf_true; + return _gf_false; +} diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 5486f9b8314..3729ca24bed 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -161,4 +161,7 @@ pl_metalock_is_active (pl_inode_t *pl_inode); int __pl_queue_lock (pl_inode_t *pl_inode, posix_lock_t *reqlock, int can_block); + +gf_boolean_t +pl_does_monkey_want_stuck_lock(); #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 783c57e6381..4231d760cdc 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -16,9 +16,9 @@ #include "list.h" #include "locks.h" +#include "clear.h" #include "common.h" - void __pl_entrylk_unref (pl_entry_lock_t *lock) { @@ -111,6 +111,97 @@ __conflicting_entrylks (pl_entry_lock_t *l1, pl_entry_lock_t *l2) return 0; } +/* See comments in inodelk.c for details */ +static inline gf_boolean_t +__stale_entrylk (xlator_t *this, pl_entry_lock_t *candidate_lock, + pl_entry_lock_t *requested_lock, time_t *lock_age_sec) +{ + posix_locks_private_t *priv = NULL; + struct timeval curr; + gettimeofday (&curr, NULL); + + priv = this->private; + + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? 
+ */ + if (names_conflict (candidate_lock->basename, + requested_lock->basename)) { + *lock_age_sec = curr.tv_sec - + candidate_lock->granted_time.tv_sec; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; +} + +/* See comments in inodelk.c for details */ +static gf_boolean_t +__entrylk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + args.type = CLRLK_ENTRY; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + + if (list_empty (&dom->entrylk_list)) + goto out; + + pthread_mutex_lock (&pinode->mutex); + lock->pinode = pinode; + list_for_each_entry_safe (lk, tmp, &dom->entrylk_list, domain_list) { + if (__stale_entrylk (this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe (lk, tmp, &dom->blocked_entrylks, + blocked_locks) { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock (&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_entrylk (this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log (this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Entry lock revoked: %d granted & %d " + "blocked locks cleared", reason_str, + uuid_utoa (pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + + return revoke_lock; +} + /** * 
entrylk_grantable - is this lock grantable? * @inode: inode in which to look @@ -546,6 +637,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pl_ctx_t *ctx = NULL; int nonblock = 0; gf_boolean_t need_inode_unref = _gf_false; + posix_locks_private_t *priv = NULL; + + priv = this->private; if (xdata) dict_ret = dict_get_str (xdata, "connection-id", &conn_id); @@ -599,6 +693,24 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, * current stack unwinds. */ pinode->inode = inode_ref (inode); + if (priv->revocation_secs != 0) { + if (cmd != ENTRYLK_UNLOCK) { + __entrylk_prune_stale (this, pinode, dom, reqlock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock ()) { + gf_log (this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + op_ret = 0; + need_inode_unref = _gf_true; + pthread_mutex_lock (&pinode->mutex); + { + __pl_entrylk_unref (reqlock); + } + pthread_mutex_unlock (&pinode->mutex); + goto out; + } + } + } switch (cmd) { case ENTRYLK_LOCK_NB: @@ -678,9 +790,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, "a bug report at http://bugs.gluster.com", cmd); goto out; } - if (need_inode_unref) - inode_unref (pinode->inode); - /* The following (extra) unref corresponds to the ref that * was done at the time the lock was granted. */ @@ -689,6 +798,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, out: + if (need_inode_unref) + inode_unref (pinode->inode); + if (unwind) { entrylk_trace_out (this, frame, volume, fd, loc, basename, cmd, type, op_ret, op_errno); @@ -772,8 +884,6 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) { list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers, client_list) { - list_del_init (&l->client_list); - pl_entrylk_log_cleanup (l); pinode = l->pinode; @@ -810,6 +920,8 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) * blocked to avoid leaving L1 to starve forever. * iv. unref the object. 
*/ + list_del_init (&l->client_list); + if (!list_empty (&l->domain_list)) { list_del_init (&l->domain_list); list_add_tail (&l->client_list, diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 1564f26b8fb..e1702c78ba1 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -16,6 +16,7 @@ #include "list.h" #include "locks.h" +#include "clear.h" #include "common.h" void @@ -130,6 +131,105 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) inodelk_type_conflict (l1, l2)); } +/* + * Check to see if the candidate lock overlaps/conflicts with the + * requested lock. If so, determine how old the lock is and return + * true if it exceeds the configured threshold, false otherwise. + */ +static inline gf_boolean_t +__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock, + pl_inode_lock_t *requested_lock, time_t *lock_age_sec) +{ + posix_locks_private_t *priv = NULL; + struct timeval curr; + + priv = this->private; + gettimeofday (&curr, NULL); + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? + */ + if (inodelk_conflict (candidate_lock, requested_lock)) { + *lock_age_sec = curr.tv_sec - + candidate_lock->granted_time.tv_sec; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; +} + +/* Examine any locks held on this inode and potentially revoke the lock + * if the age exceeds revocation_secs. We will clear _only_ those locks + * which are granted, and then grant those locks which are blocked. + * + * Depending on how this patch works in the wild, we may expand this and + * introduce a heuristic which clears blocked locks as well if they + * are beyond a threshold. 
+ */ +static gf_boolean_t +__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_inode_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + + args.type = CLRLK_INODE; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty (&dom->inodelk_list)) + goto out; + + pthread_mutex_lock (&pinode->mutex); + list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) { + if (__stale_inodelk (this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe (lk, tmp, &dom->blocked_inodelks, + blocked_locks) { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock (&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log (this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Inode lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa (pinode->gfid), dom->domain, + lk_age_sec, gcount, bcount); + } + return revoke_lock; +} + /* Determine if lock is grantable or not */ static pl_inode_lock_t * __inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) @@ -419,8 +519,6 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) { list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers, client_list) { - list_del_init (&l->client_list); - 
pl_inodelk_log_cleanup (l); pl_inode = l->pl_inode; @@ -458,6 +556,8 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) * forever. * iv. unref the object. */ + list_del_init (&l->client_list); + if (!list_empty (&l->list)) { __delete_inode_lock (l); list_add_tail (&l->client_list, @@ -509,6 +609,7 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, inode_t *inode) { + posix_locks_private_t *priv = NULL; int ret = -EINVAL; pl_inode_lock_t *retlock = NULL; gf_boolean_t unref = _gf_true; @@ -518,6 +619,8 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, lock->pl_inode = pl_inode; fl_type = lock->fl_type; + priv = this->private; + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or * an unsuccessful blocking lock operation, the inode needs to be ref'd. * @@ -537,6 +640,24 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, */ pl_inode->inode = inode_ref (inode); + if (priv->revocation_secs != 0) { + if (lock->fl_type != F_UNLCK) { + __inodelk_prune_stale (this, pl_inode, dom, lock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock ()) { + pthread_mutex_lock (&pl_inode->mutex); + { + __pl_inodelk_unref (lock); + } + pthread_mutex_unlock (&pl_inode->mutex); + inode_unref (pl_inode->inode); + gf_log (this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + return 0; + } + } + } + if (ctx) pthread_mutex_lock (&ctx->lock); pthread_mutex_lock (&pl_inode->mutex); diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index e363f425b65..8eb35da44be 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -190,6 +190,10 @@ typedef struct { mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */ gf_boolean_t trace; /* trace lock requests in and out */ char *brickname; + gf_boolean_t monkey_unlocking; 
+ uint32_t revocation_secs; + gf_boolean_t revocation_clear_all; + uint32_t revocation_max_blocked; } posix_locks_private_t; diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 3415d59324c..7f85ba4fca5 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -3629,7 +3629,21 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("trace", priv->trace, options, bool, out); + GF_OPTION_RECONF ("monkey-unlocking", priv->monkey_unlocking, options, + bool, out); + + GF_OPTION_RECONF ("revocation-secs", + priv->revocation_secs, options, + uint32, out); + + GF_OPTION_RECONF ("revocation-clear-all", priv->revocation_clear_all, + options, bool, out); + + GF_OPTION_RECONF ("revocation-max-blocked", + priv->revocation_max_blocked, options, + uint32, out); ret = 0; + out: return ret; } @@ -3680,6 +3694,18 @@ init (xlator_t *this) GF_OPTION_INIT ("trace", priv->trace, bool, out); + GF_OPTION_INIT ("monkey-unlocking", priv->monkey_unlocking, + bool, out); + + GF_OPTION_INIT ("revocation-secs", priv->revocation_secs, + uint32, out); + + GF_OPTION_INIT ("revocation-clear-all", priv->revocation_clear_all, + bool, out); + + GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked, + uint32, out); + this->local_pool = mem_pool_new (pl_local_t, 32); if (!this->local_pool) { ret = -1; @@ -3936,5 +3962,35 @@ struct volume_options options[] = { .description = "Trace the different lock requests " "to logs." }, + { .key = { "monkey-unlocking" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Ignore a random number of unlock requests. Useful " + "for testing/creating robust lock recovery mechanisms." 
+ }, + { .key = {"revocation-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .description = "Maximum time a lock can be taken out, before" + "being revoked.", + }, + { .key = {"revocation-clear-all"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "If set to true, will revoke BOTH granted and blocked " + "(pending) lock requests if a revocation threshold is " + "hit.", + }, + { .key = {"revocation-max-blocked"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .description = "A number of blocked lock requests after which a lock " + "will be revoked to allow the others to proceed. Can " + "be used in conjunction w/ revocation-clear-all." + }, { .key = {NULL} }, }; diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c index fc2ff2ab10d..f5062971bf4 100644 --- a/xlators/features/snapview-server/src/snapview-server-mgmt.c +++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c @@ -73,7 +73,7 @@ svs_mgmt_init (xlator_t *this) if (cmd_args->volfile_server) host = cmd_args->volfile_server; - ret = rpc_transport_inet_options_build (&options, host, port); + ret = rpc_transport_inet_options_build (&options, host, port, NULL); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to build the " "transport options"); diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index bf62290d023..3c21b9755ea 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -3418,7 +3418,7 @@ glusterd_transport_keepalive_options_get (int *interval, int *time, int glusterd_transport_inet_options_build (dict_t **options, const char *hostname, - int port) + int port, char *addr_family) { dict_t *dict = NULL; int32_t interval = -1; @@ -3433,7 +3433,8 @@ glusterd_transport_inet_options_build 
(dict_t **options, const char *hostname, port = GLUSTERD_DEFAULT_PORT; /* Build default transport options */ - ret = rpc_transport_inet_options_build (&dict, hostname, port); + ret = rpc_transport_inet_options_build (&dict, hostname, port, + addr_family); if (ret) goto out; @@ -3470,6 +3471,7 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo, int ret = -1; glusterd_peerctx_t *peerctx = NULL; data_t *data = NULL; + char *addr_family = NULL; peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t); if (!peerctx) @@ -3485,9 +3487,15 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo, uniquely identify a peerinfo */ + if (dict_get_str(this->options, "transport.address-family", + &addr_family)) { + addr_family = NULL; + } + ret = glusterd_transport_inet_options_build (&options, peerinfo->hostname, - peerinfo->port); + peerinfo->port, + addr_family); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c index 0ea66a027bf..4fdff3402f5 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handshake.c +++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c @@ -241,6 +241,50 @@ build_volfile_path (char *volume_id, char *path, } + volid_ptr = strstr (volume_id, "gfproxy-client/"); + if (volid_ptr) { + volid_ptr = strchr (volid_ptr, '/'); + if (!volid_ptr) { + ret = -1; + goto out; + } + volid_ptr++; + + ret = glusterd_volinfo_find (volid_ptr, &volinfo); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't find volinfo"); + goto out; + } + + glusterd_get_gfproxy_client_volfile (volinfo, path, path_len); + + ret = 0; + goto out; + } + + volid_ptr = strstr (volume_id, "gfproxy/"); + if (volid_ptr) { + volid_ptr = strchr (volid_ptr, '/'); + if (!volid_ptr) { + ret = -1; + goto out; + } + volid_ptr++; + + ret = glusterd_volinfo_find (volid_ptr, &volinfo); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't find volinfo"); + goto 
out; + } + + glusterd_get_gfproxyd_volfile (volinfo, path, path_len); + + ret = 0; + goto out; + } + volid_ptr = strstr (volume_id, "/snaps/"); if (volid_ptr) { ret = get_snap_volname_and_volinfo (volid_ptr, &volname, diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index e7ae9b7848d..de5fce5a965 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -10796,6 +10796,45 @@ out: } void +glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len) +{ + char workdir[PATH_MAX] = {0, }; + glusterd_conf_t *priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + + switch (volinfo->transport_type) { + case GF_TRANSPORT_TCP: + snprintf (path, path_len, + "%s/trusted-%s.tcp-gfproxy-fuse.vol", + workdir, volinfo->volname); + break; + + case GF_TRANSPORT_RDMA: + snprintf (path, path_len, + "%s/trusted-%s.rdma-gfproxy-fuse.vol", + workdir, volinfo->volname); + break; + default: + break; + } +} + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len) +{ + char workdir[PATH_MAX] = {0, }; + glusterd_conf_t *priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + + snprintf (path, path_len, "%s/%s.gfproxyd.vol", workdir, + volinfo->volname); +} + +void glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo, char *path, int path_len) { diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index f4c4138829f..7445407c010 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -642,6 +642,14 @@ void glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo, char *path, int path_len); +void +glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, + char *path, 
int path_len); + int32_t glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo, glusterd_brickinfo_t *dup_brickinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 25fb23f72b2..2344fd169f1 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -58,6 +58,20 @@ extern struct volopt_map_entry glusterd_volopt_map[]; } \ } while (0 /* CONSTCOND */) +/** + * Needed for GFProxy + */ +#define GF_PROXY_DAEMON_PORT 40000 +#define GF_PROXY_DAEMON_PORT_STR "40000" + +static int +volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *set_dict, void *param); + +static int +build_client_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *mod_dict); + /********************************************* * * xlator generation / graph manipulation API @@ -1448,6 +1462,75 @@ server_spec_extended_option_handler (volgen_graph_t *graph, static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo); static int +gfproxy_server_graph_builder (volgen_graph_t *graph, + glusterd_volinfo_t *volinfo, + dict_t *set_dict, void *param) +{ + xlator_t *xl = NULL; + char *value = NULL; + char transt[16] = {0, }; + char key[1024] = {0, }; + char port_str[7] = {0, }; + int ret = 0; + char *username = NULL; + char *password = NULL; + int rclusters = 0; + + /* We are a trusted client */ + ret = dict_set_uint32 (set_dict, "trusted-client", GF_CLIENT_TRUSTED); + if (ret != 0) + goto out; + + ret = dict_set_str (set_dict, "gfproxy-server", "on"); + if (ret != 0) + goto out; + + /* Build the client section of the graph first */ + build_client_graph (graph, volinfo, set_dict); + + /* Clear this setting so that future users of set_dict do not end up + * thinking they are a gfproxy server */ + dict_del (set_dict, "gfproxy-server"); + dict_del (set_dict, "trusted-client"); + + /* Then add the server to it */ + get_vol_transport_type 
(volinfo, transt); + xl = volgen_graph_add (graph, "protocol/server", volinfo->volname); + if (!xl) + goto out; + + ret = xlator_set_option (xl, "listen-port", GF_PROXY_DAEMON_PORT_STR); + if (ret != 0) + goto out; + + ret = xlator_set_option (xl, "transport-type", transt); + if (ret != 0) + goto out; + + /* Set username and password */ + username = glusterd_auth_get_username (volinfo); + password = glusterd_auth_get_password (volinfo); + if (username) { + snprintf (key, sizeof (key), "auth.login.%s-server.allow", + volinfo->volname); + ret = xlator_set_option (xl, key, username); + if (ret) + return -1; + } + + if (password) { + snprintf (key, sizeof (key), "auth.login.%s.password", + username); + ret = xlator_set_option (xl, key, password); + if (ret != 0) + goto out; + } + +out: + return ret; +} + +static int brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, dict_t *set_dict, glusterd_brickinfo_t *brickinfo) { @@ -2541,6 +2624,48 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, } static int +gfproxy_server_perfxl_option_handler (volgen_graph_t *graph, + struct volopt_map_entry *vme, + void *param) +{ + gf_boolean_t enabled = _gf_false; + glusterd_volinfo_t *volinfo = NULL; + + GF_ASSERT (param); + volinfo = param; + + /* write-behind is the *not* allowed for gfproxy-servers */ + if (strstr (vme->key, "write-behind")) { + return 0; + } + + perfxl_option_handler (graph, vme, param); + + return 0; +} + +static int +gfproxy_client_perfxl_option_handler (volgen_graph_t *graph, + struct volopt_map_entry *vme, + void *param) +{ + gf_boolean_t enabled = _gf_false; + glusterd_volinfo_t *volinfo = NULL; + + GF_ASSERT (param); + volinfo = param; + + /* write-behind is the only allowed "perf" for gfproxy-clients */ + if (!strstr (vme->key, "write-behind")) + return 0; + + perfxl_option_handler (graph, vme, param); + + return 0; +} + + +static int nfsperfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry 
*vme, void *param) { @@ -2768,8 +2893,10 @@ _free_xlator_opt_key (char *key) } static xlator_t * -volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, - char *hostname, char *subvol, char *xl_id, +volgen_graph_build_client (volgen_graph_t *graph, + glusterd_volinfo_t *volinfo, + char *hostname, char *port, + char *subvol, char *xl_id, char *transt, dict_t *set_dict) { xlator_t *xl = NULL; @@ -2801,6 +2928,12 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, goto err; } + if (port) { + ret = xlator_set_option (xl, "remote-port", port); + if (ret) + goto err; + } + ret = xlator_set_option (xl, "remote-subvolume", subvol); if (ret) goto err; @@ -2824,7 +2957,8 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, ret = dict_get_uint32 (set_dict, "trusted-client", &client_type); - if (!ret && client_type == GF_CLIENT_TRUSTED) { + if (!ret && (client_type == GF_CLIENT_TRUSTED + || client_type == GF_CLIENT_TRUSTED_PROXY)) { str = NULL; str = glusterd_auth_get_username (volinfo); if (str) { @@ -2911,7 +3045,9 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, i = 0; cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) { xl = volgen_graph_build_client (graph, volinfo, - brick->hostname, brick->path, + brick->hostname, + NULL, + brick->path, brick->brick_id, transt, set_dict); if (!xl) { @@ -3143,8 +3279,9 @@ volgen_graph_build_snapview_client (volgen_graph_t *graph, get_transport_type (volinfo, set_dict, transt, _gf_false); - prot_clnt = volgen_graph_build_client (graph, volinfo, NULL, subvol, - xl_id, transt, set_dict); + prot_clnt = volgen_graph_build_client (graph, volinfo, + NULL, NULL, subvol, + xl_id, transt, set_dict); if (!prot_clnt) { ret = -1; goto out; @@ -3555,6 +3692,27 @@ static int client_graph_set_perf_options(volgen_graph_t *graph, { data_t *tmp_data = NULL; char *volname = NULL; + int ret = 0; + + /* + * Logic to make sure 
gfproxy-client gets custom performance translators + */ + ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); + if (ret == 1) { + return volgen_graph_set_options_generic ( + graph, set_dict, volinfo, + &gfproxy_client_perfxl_option_handler); + } + + /* + * Logic to make sure gfproxy-server gets custom performance translators + */ + ret = dict_get_str_boolean (set_dict, "gfproxy-server", 0); + if (ret == 1) { + return volgen_graph_set_options_generic ( + graph, set_dict, volinfo, + &gfproxy_server_perfxl_option_handler); + } /* * Logic to make sure NFS doesn't have performance translators by @@ -3768,29 +3926,55 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, char *volname = NULL; glusterd_conf_t *conf = THIS->private; char *tmp = NULL; + char *hostname = NULL; gf_boolean_t var = _gf_false; gf_boolean_t ob = _gf_false; + gf_boolean_t is_gfproxy = _gf_false; int uss_enabled = -1; xlator_t *this = THIS; + char *subvol = NULL; + size_t subvol_namelen = 0; GF_ASSERT (this); GF_ASSERT (conf); - volname = volinfo->volname; - ret = volgen_graph_build_clients (graph, volinfo, set_dict, - param); - if (ret) + ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); + if (ret == -1) goto out; - if (volinfo->type == GF_CLUSTER_TYPE_TIER) - ret = volume_volgen_graph_build_clusters_tier - (graph, volinfo, _gf_false); - else - ret = volume_volgen_graph_build_clusters - (graph, volinfo, _gf_false); + volname = volinfo->volname; + if (ret == 0) { + ret = volgen_graph_build_clients (graph, volinfo, set_dict, + param); + if (ret) + goto out; - if (ret == -1) - goto out; + if (volinfo->type == GF_CLUSTER_TYPE_TIER) + ret = volume_volgen_graph_build_clusters_tier + (graph, volinfo, _gf_false); + else + ret = volume_volgen_graph_build_clusters + (graph, volinfo, _gf_false); + + if (ret == -1) + goto out; + } else { + is_gfproxy = _gf_true; + ret = dict_get_str (set_dict, + "config.gfproxyd-remote-host", &tmp); + if (ret == -1) + goto out; + + 
subvol_namelen = strlen (volinfo->volname) + + strlen ("-server") + 1; + subvol = alloca (subvol_namelen); + snprintf (subvol, subvol_namelen, + "%s-server", volinfo->volname); + + volgen_graph_build_client (graph, volinfo, tmp, + GF_PROXY_DAEMON_PORT_STR, subvol, + "gfproxy", "tcp", set_dict); + } ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false); if (ret == -1) @@ -3851,6 +4035,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } } + /* gfproxy needs the AHA translator */ + if (is_gfproxy) { + xl = volgen_graph_add (graph, "cluster/aha", volname); + if (!xl) { + ret = -1; + goto out; + } + } + if (conf->op_version == GD_OP_VERSION_MIN) { ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA); @@ -4731,6 +4924,24 @@ out: return ret; } +static int +volgen_graph_set_iam_nfsd (const volgen_graph_t *graph) +{ + xlator_t *trav; + int ret = 0; + + for (trav = first_of ((volgen_graph_t *)graph); trav; + trav = trav->next) { + if (strcmp (trav->type, "cluster/replicate") != 0) + continue; + + ret = xlator_set_option (trav, "iam-nfs-daemon", "yes"); + if (ret) + break; + } + return ret; +} + /* builds a graph for nfs server role, with option overrides in mod_dict */ int build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) @@ -4869,6 +5080,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) if (ret) goto out; + ret = volgen_graph_set_iam_nfsd (&cgraph); + if (ret) + goto out; + ret = volgen_graph_merge_sub (graph, &cgraph, 1); if (ret) goto out; @@ -4930,6 +5145,22 @@ get_brick_filepath (char *filename, glusterd_volinfo_t *volinfo, brickinfo->hostname, brick); } +static void +get_gfproxyd_filepath (char *filename, glusterd_volinfo_t *volinfo) +{ + char path[PATH_MAX] = {0, }; + char brick[PATH_MAX] = {0, }; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv); + + snprintf (filename, PATH_MAX, + "%s/%s.gfproxyd.vol", path, + volinfo->volname); +} 
+ gf_boolean_t glusterd_is_valid_volfpath (char *volname, char *brick) { @@ -4975,6 +5206,32 @@ out: } static int +glusterd_generate_gfproxyd_volfile (glusterd_volinfo_t *volinfo) +{ + volgen_graph_t graph = {0, }; + char filename[PATH_MAX] = {0, }; + int ret = -1; + + GF_ASSERT (volinfo); + + get_gfproxyd_filepath (filename, volinfo); + + struct glusterd_gfproxyd_info info = { + .port = GF_PROXY_DAEMON_PORT, + }; + + ret = build_graph_generic (&graph, volinfo, + NULL, &info, + &gfproxy_server_graph_builder); + if (ret == 0) + ret = volgen_write_volfile (&graph, filename); + + volgen_graph_free (&graph); + + return ret; +} + +static int glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo, dict_t *mod_dict, void *data) @@ -5245,7 +5502,8 @@ glusterd_generate_client_per_brick_volfile (glusterd_volinfo_t *volinfo) cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) { xl = volgen_graph_build_client (&graph, volinfo, - brick->hostname, brick->path, + brick->hostname, + NULL, brick->path, brick->brick_id, "tcp", dict); if (!xl) { @@ -5376,6 +5634,11 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo, ret = glusterd_get_trusted_client_filepath (filepath, volinfo, type); + } else if (client_type == GF_CLIENT_TRUSTED_PROXY) { + glusterd_get_gfproxy_client_volfile (volinfo, + filepath, + PATH_MAX); + ret = dict_set_str (dict, "gfproxy-client", "on"); } else { ret = glusterd_get_client_filepath (filepath, volinfo, @@ -5620,6 +5883,7 @@ build_bitd_volume_graph (volgen_graph_t *graph, xl = volgen_graph_build_client (&cgraph, volinfo, brickinfo->hostname, + NULL, brickinfo->path, brickinfo->brick_id, transt, set_dict); @@ -5782,6 +6046,7 @@ build_scrub_volume_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, xl = volgen_graph_build_client (&cgraph, volinfo, brickinfo->hostname, + NULL, brickinfo->path, brickinfo->brick_id, transt, set_dict); @@ -5913,12 +6178,25 @@ glusterd_create_volfiles (glusterd_volinfo_t 
*volinfo) goto out; } + ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED_PROXY); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not generate gfproxy client volfiles"); + goto out; + } + ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER); if (ret) gf_msg (this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL, "Could not generate client volfiles"); + + ret = glusterd_generate_gfproxyd_volfile (volinfo); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Could not generate gfproxy volfiles"); + out: return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index f90177372dc..cb2cad50efc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -52,7 +52,8 @@ typedef enum { GF_CLIENT_TRUSTED, - GF_CLIENT_OTHER + GF_CLIENT_OTHER, + GF_CLIENT_TRUSTED_PROXY, } glusterd_client_type_t; struct volgen_graph { diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index bade4ffb06d..61c79655ccf 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -286,6 +286,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) int32_t type = 0; char *username = NULL; char *password = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif GF_ASSERT (req); @@ -388,10 +393,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) /* Setting default as inet for trans_type tcp */ ret = dict_set_dynstr_with_alloc (dict, "transport.address-family", - "inet"); + addr_family); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to set transport.address-family"); + "failed to set transport.address-family " + "to %s", addr_family); goto out; } } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 1e24adabe0c..bcb8877c5bd 
100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1048,6 +1048,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.min-free-strict-mode", + .voltype = "cluster/distribute", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "cluster.min-free-inodes", .voltype = "cluster/distribute", .op_version = 1, @@ -1113,6 +1118,13 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT, }, + { .key = "cluster.du-refresh-interval-sec", + .voltype = "cluster/distribute", + .option = "du-refresh-interval-sec", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, + /* NUFA xlator options (Distribute special case) */ { .key = "cluster.nufa", .voltype = "cluster/distribute", @@ -1461,6 +1473,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "thread-count", .op_version = 1 }, + { .key = "performance.io-thread-fops-per-thread-ratio", + .voltype = "performance/io-threads", + .option = "fops-per-thread-ratio", + .op_version = 1 + }, { .key = "performance.high-prio-threads", .voltype = "performance/io-threads", .op_version = 1 @@ -1555,6 +1572,18 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "performance.write-behind-trickling-writes", + .voltype = "performance/write-behind", + .option = "trickling-writes", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "performance.nfs.write-behind-trickling-writes", + .voltype = "performance/write-behind", + .option = "trickling-writes", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "performance.lazy-open", .voltype = "performance/open-behind", .option = "lazy-open", @@ -2500,6 +2529,14 @@ struct volopt_map_entry glusterd_volopt_map[] = { .voltype = "storage/posix", .op_version = GD_OP_VERSION_3_6_0, }, + { .key = "storage.min-free-disk", + .voltype = 
"storage/posix", + .op_version = 2, + }, + { .key = "storage.freespace-check-interval", + .voltype = "storage/posix", + .op_version = 2, + }, { .key = "storage.bd-aio", .voltype = "storage/bd", .op_version = 3 @@ -2515,6 +2552,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "!config", .op_version = 2 }, + { .key = "config.gfproxyd-remote-host", + .voltype = "configuration", + .option = "gfproxyd-remote-host", + .op_version = 2 + }, { .key = GLUSTERD_QUORUM_TYPE_KEY, .voltype = "mgmt/glusterd", .value = "off", @@ -2961,7 +3003,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { { .key = "cluster.locking-scheme", .voltype = "cluster/replicate", .type = DOC, - .op_version = GD_OP_VERSION_3_7_12, + .op_version = GD_OP_VERSION_3_7_12 , .flags = OPT_FLAG_CLIENT_OPT }, { .key = "cluster.granular-entry-heal", @@ -2970,6 +3012,72 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_8_0, .flags = OPT_FLAG_CLIENT_OPT }, + { .option = "revocation-secs", + .key = "features.locks-revocation-secs", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "revocation-clear-all", + .key = "features.locks-revocation-clear-all", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "revocation-max-blocked", + .key = "features.locks-revocation-max-blocked", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "monkey-unlocking", + .key = "features.locks-monkey-unlocking", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + .type = NO_DOC, + }, + { .key = "cluster.halo-enabled", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-hybrid-mode", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-failover-enabled", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, 
+ { .key = "cluster.halo-shd-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-nfsd-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-replicas", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-min-replicas", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-min-samples", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = NULL } }; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index bb6af7f378f..4795f958038 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -222,6 +222,11 @@ struct glusterd_brickinfo { typedef struct glusterd_brickinfo glusterd_brickinfo_t; +struct glusterd_gfproxyd_info { + short port; + char *logfile; +}; + struct gf_defrag_brickinfo_ { char *name; int files; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 6c4cdfed062..598f62fee7a 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -186,6 +186,25 @@ start_glusterfs () fi #options with values start here + if [ -n "$halo_failover_enabled" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-failover-enabled=$halo_failover_enabled"); + fi + if [ -n "$halo_max_latency" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-latency=$halo_max_latency"); + fi + + if [ -n "$halo_max_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-replicas=$halo_max_replicas"); + 
fi + + if [ -n "$halo_min_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-min-replicas=$halo_min_replicas"); + fi + if [ -n "$log_level" ]; then cmd_line=$(echo "$cmd_line --log-level=$log_level"); fi @@ -479,6 +498,18 @@ with_options() [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts," fuse_mountopts="${fuse_mountopts}$key=\"$value\"" ;; + "halo-max-latency") + halo_max_latency=$value + ;; + "halo-max-replicas") + halo_max_replicas=$value + ;; + "halo-min-replicas") + halo_min_replicas=$value + ;; + "halo-failover-enabled") + halo_failover_enabled=$value + ;; x-*) # comments or userspace application-specific options, drop them ;; diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h index bc9af2f0b8b..0079b9a3deb 100644 --- a/xlators/nfs/server/src/exports.h +++ b/xlators/nfs/server/src/exports.h @@ -22,7 +22,7 @@ #define GF_EXP GF_NFS"-exports" #define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())" -#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)" +#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)" #define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)" #define NETGROUP_MAX_LEN 128 diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c index 48b719d29aa..bff7e0669ff 100644 --- a/xlators/nfs/server/src/mount3.c +++ b/xlators/nfs/server/src/mount3.c @@ -1896,7 +1896,7 @@ _mnt3_get_host_from_peer (const char *peer_addr) size_t host_len = 0; char *colon = NULL; - colon = strchr (peer_addr, ':'); + colon = strrchr (peer_addr, ':'); if (!colon) { gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER, "Bad peer %s", peer_addr); @@ -4123,6 +4123,15 @@ mnt1svc_init (xlator_t *nfsx) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (options, "transport.address-family", "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, + "dict_set_str error when trying to enable ipv6"); + goto err; + } +#endif + ret = rpcsvc_create_listeners 
(nfs->rpcsvc, options, nfsx->name); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, errno, diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c index e8e226e953e..536a45ede3d 100644 --- a/xlators/nfs/server/src/mount3udp_svc.c +++ b/xlators/nfs/server/src/mount3udp_svc.c @@ -133,7 +133,15 @@ mountudp_program_3(struct svc_req *rqstp, register SVCXPRT *transp) mountres3 *res = NULL; struct sockaddr_in *sin = NULL; - sin = svc_getcaller (transp); + sin = (struct sockaddr_in *)svc_getcaller (transp); + /* svc_getcaller returns a pointer to a sockaddr_in6, even though it + * might actually be an IPv4 address. It ought return a struct sockaddr + * and make the caller upcast it to the proper address family. Sigh. + * + * Let's make sure that it's actually an IPv4 address. + */ + GF_ASSERT (sin->sin_family == AF_INET); + inet_ntop (AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN+1); switch (rqstp->rq_proc) { diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c index af37f6b264c..a39a0e6ee3a 100644 --- a/xlators/nfs/server/src/nfs-common.c +++ b/xlators/nfs/server/src/nfs-common.c @@ -138,8 +138,12 @@ nfs_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path) gf_uuid_copy (loc->gfid, inode->gfid); } - if (parent) + if (parent) { loc->parent = inode_ref (parent); + if (!gf_uuid_is_null (parent->gfid)) { + gf_uuid_copy (loc->pargfid, parent->gfid); + } + } if (path) { loc->path = gf_strdup (path); diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c index ddfa89dab11..d5087f195ca 100644 --- a/xlators/nfs/server/src/nfs.c +++ b/xlators/nfs/server/src/nfs.c @@ -204,6 +204,9 @@ nfs_program_register_portmap_all (struct nfs_state *nfs) if (nfs->override_portnum) prog->progport = nfs->override_portnum; (void) rpcsvc_program_register_portmap (prog, prog->progport); +#ifdef IPV6_DEFAULT + (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport); +#endif } return (0); @@ 
-339,6 +342,17 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this) if (version->required) goto err; } +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_register_rpcbind6 (prog, + prog->progport); + if (ret == -1) { + gf_msg (GF_NFS, GF_LOG_ERROR, 0, + NFS_MSG_PGM_REG_FAIL, + "Program (ipv6) %s registration failed", + prog->progname); + goto err; + } +#endif } } @@ -901,6 +915,16 @@ nfs_init_state (xlator_t *this) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (this->options, "transport.address-family", + "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error"); + goto free_foppool; + } +#endif + + /* Right only socket support exists between nfs client and * gluster nfs, so we can set default value as socket */ @@ -2019,7 +2043,7 @@ struct volume_options options[] = { }, { .key = {"nfs.mount-rmtab"}, .type = GF_OPTION_TYPE_PATH, - .default_value = NFS_DATADIR "/rmtab", + .default_value = "/-", .description = "Set the location of the cache file that is used to " "list all the NFS-clients that have connected " "through the MOUNT protocol. If this is on shared " diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c index 64287c5b1bd..5aa9ea4e76e 100644 --- a/xlators/nfs/server/src/nfs3.c +++ b/xlators/nfs/server/src/nfs3.c @@ -372,6 +372,28 @@ out: } while (0) \ +/* + * This macro checks if the volume is started or not. + * If it is not started, it closes the client connection & logs it. + * + * Why do we do this? + * + * There is a "race condition" where gNFSd may start listening for RPC requests + * prior to the volume being started. Presumably, that is why this macro exists + * in the first place. In the NFS kernel client (specifically Linux's NFS + * kernel client), they establish a TCP connection to our endpoint and + * (re-)send requests. If we ignore the request, and return nothing back, + * the NFS kernel client waits forever for our response. 
If for some reason, + * the TCP connection were to die, and re-establish, the requests are + * retransmitted and everything begins working as expected + * + * Now, this is clearly bad behavior on the client side, + * but in order to make every user's life easier, + * gNFSd should simply disconnect the TCP connection if it sees requests + * before it is ready to accept them. + * + */ + #define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl) \ do { \ if ((!nfs_subvolume_started (nfs_state (nf3stt->nfsx), vlm))){\ @@ -379,11 +401,32 @@ out: NFS_MSG_VOL_DISABLE, \ "Volume is disabled: %s", \ vlm->name); \ + nfs3_disconnect_transport (req->trans); \ rtval = RPCSVC_ACTOR_IGNORE; \ goto erlbl; \ } \ } while (0) \ +void +nfs3_disconnect_transport (rpc_transport_t *transport) +{ + int ret = 0; + + GF_VALIDATE_OR_GOTO (GF_NFS3, transport, out); + + ret = rpc_transport_disconnect (transport); + if (ret != 0) { + gf_log (GF_NFS3, GF_LOG_WARNING, + "Unable to close client connection to %s.", + transport->peerinfo.identifier); + } else { + gf_log (GF_NFS3, GF_LOG_WARNING, + "Closed client connection to %s.", + transport->peerinfo.identifier); + } +out: + return; +} int nfs3_export_sync_trusted (struct nfs3_state *nfs3, uuid_t exportid) @@ -778,6 +821,12 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, cs = frame->local; if (op_ret == -1) { + /* Prevent crashes for the case where this call fails + * and buf is left in a NULL state, yet the op_errno == 0. 
+ */ + if (!buf && op_errno == 0) { + op_errno = EIO; + } status = nfs3_cbk_errno_status (op_ret, op_errno); } diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index c81a97d8a39..5ab38890df3 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -162,8 +162,6 @@ iot_worker (void *data) THIS = this; for (;;) { - sleep_till.tv_sec = time (NULL) + conf->idle_time; - pthread_mutex_lock (&conf->mutex); { if (pri != -1) { @@ -171,8 +169,11 @@ iot_worker (void *data) pri = -1; } while (conf->queue_size == 0) { - conf->sleep_count++; + clock_gettime (CLOCK_REALTIME_COARSE, + &sleep_till); + sleep_till.tv_sec += conf->idle_time; + conf->sleep_count++; ret = pthread_cond_timedwait (&conf->cond, &conf->mutex, &sleep_till); @@ -202,7 +203,7 @@ iot_worker (void *data) &conf->mutex, &sleep); pthread_mutex_unlock(&conf->mutex); continue; - } + } } pthread_mutex_unlock (&conf->mutex); @@ -228,14 +229,25 @@ int do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri) { int ret = 0; + int active_count = 0; pthread_mutex_lock (&conf->mutex); { __iot_enqueue (conf, stub, pri); - pthread_cond_signal (&conf->cond); - - ret = __iot_workers_scale (conf); + /* If we have an ample supply of threads alive already + * it's massively more efficient to keep the ones you have + * busy vs making new ones and signaling everyone + */ + active_count = conf->curr_count - conf->sleep_count; + if (conf->fops_per_thread_ratio == 0 || active_count == 0 || + (conf->queue_size/active_count > + conf->fops_per_thread_ratio && + active_count < conf->max_count)) { + pthread_cond_signal (&conf->cond); + + ret = __iot_workers_scale (conf); + } } pthread_mutex_unlock (&conf->mutex); @@ -900,6 +912,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); + GF_OPTION_RECONF ("fops-per-thread-ratio", 
conf->fops_per_thread_ratio, + options, int32, out); + GF_OPTION_RECONF ("high-prio-threads", conf->ac_iot_limit[IOT_PRI_HI], options, int32, out); @@ -972,6 +987,9 @@ init (xlator_t *this) GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); + GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio, + int32, out); + GF_OPTION_INIT ("high-prio-threads", conf->ac_iot_limit[IOT_PRI_HI], int32, out); @@ -1096,6 +1114,20 @@ struct volume_options options[] = { "perform concurrent IO operations" }, + { .key = {"fops-per-thread-ratio"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_FOP_PER_THREAD, + .max = IOT_MAX_FOP_PER_THREAD, + .default_value = "20", + .description = "The optimal ratio of threads to FOPs in the queue " + "we wish to achieve before creating a new thread. " + "The idea here is it's far cheaper to keep our " + "currently running threads busy than spin up " + "new threads or cause a stampeding herd of threads " + "to service a singlular FOP when you have a thread " + "which will momentarily become available to do the " + "work." 
+ }, { .key = {"high-prio-threads"}, .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index d8eea2cf77a..e5c97f690a2 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -34,7 +34,9 @@ struct iot_conf; #define IOT_MIN_THREADS 1 #define IOT_DEFAULT_THREADS 16 -#define IOT_MAX_THREADS 64 +#define IOT_MAX_THREADS 256 +#define IOT_MIN_FOP_PER_THREAD 0 +#define IOT_MAX_FOP_PER_THREAD 2000 #define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) @@ -62,6 +64,7 @@ struct iot_conf { pthread_cond_t cond; int32_t max_count; /* configured maximum */ + int32_t fops_per_thread_ratio; int32_t curr_count; /* actual number of threads running */ int32_t sleep_count; diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 30443761c56..c3baafdc1b6 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -33,6 +33,7 @@ struct mdc_conf { gf_boolean_t cache_selinux; gf_boolean_t force_readdirp; gf_boolean_t cache_swift_metadata; + gf_boolean_t cache_all_xattrs; }; @@ -792,6 +793,7 @@ struct checkpair { static int is_mdc_key_satisfied (const char *key) { + unsigned int checked_keys = 0; const char *mdc_key = NULL; int i = 0; @@ -801,11 +803,13 @@ is_mdc_key_satisfied (const char *key) for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { if (!mdc_keys[i].load) continue; + + checked_keys++; if (strcmp (mdc_key, key) == 0) return 1; } - return 0; + return 0; } @@ -875,7 +879,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_rsp = NULL; dict_t *xattr_alloc = NULL; mdc_local_t *local = NULL; - + struct mdc_conf *conf = this->private; local = mdc_local_get (frame); if (!local) @@ -899,10 +903,17 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (ret != 0) 
goto uncached; - if (!mdc_xattr_satisfied (this, xdata, xattr_rsp)) + /* Only check the keys if we are not caching all the xattrs */ + if (!conf->cache_all_xattrs && + !mdc_xattr_satisfied (this, xdata, xattr_rsp)) { goto uncached; + } } + gf_msg (this->name, GF_LOG_TRACE, 0, 0, + "Returning lookup from cache for gfid %s", + uuid_utoa(loc->inode->gfid)); + MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf, xattr_rsp, &postparent); @@ -1882,6 +1893,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, int op_errno = ENODATA; mdc_local_t *local = NULL; dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; local = mdc_local_get (frame); if (!local) @@ -1897,7 +1909,18 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, goto uncached; if (!xattr || !dict_get (xattr, (char *)key)) { - ret = -1; + /* If we can't find the extended attribute, & cache-all-xattrs + * is enabled, we should wind and try to find them. + * + * NOTE: Quota & AFR queries through the mount + * (i.e, virtual Gluster xattrs) + * won't work unless we do this. 
+ */ + if (conf->cache_all_xattrs) { + goto uncached; + } + + ret = -1; op_errno = ENODATA; } @@ -2363,7 +2386,8 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out); - + GF_OPTION_RECONF("cache-all-xattrs", conf->cache_all_xattrs, options, + bool, out); out: return 0; } @@ -2404,6 +2428,7 @@ init (xlator_t *this) conf->cache_swift_metadata); GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); + GF_OPTION_INIT ("cache-all-xattrs", conf->cache_all_xattrs, bool, out); out: this->private = conf; @@ -2474,7 +2499,7 @@ struct volume_options options[] = { { .key = {"md-cache-timeout"}, .type = GF_OPTION_TYPE_INT, .min = 0, - .max = 60, + .max = 300, .default_value = "1", .description = "Time period after which cache has to be refreshed", }, @@ -2484,5 +2509,19 @@ struct volume_options options[] = { .description = "Convert all readdir requests to readdirplus to " "collect stat info on each entry.", }, + { .key = {"strict-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "When reading extended attributes from the cache, " + "if an xattr is not found, attempt to find it by winding " + "instead of returning ENODATA. This is necessary to query " + "the special extended attributes (trusted.glusterfs.quota.size) " + "through a FUSE mount with md-cache enabled." 
+ }, + { .key = {"cache-all-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Cache all the extended attributes for an inode.", + }, { .key = {NULL} }, }; diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 7f5719b1e48..bc59036ff88 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -169,6 +169,7 @@ typedef struct wb_request { typedef struct wb_conf { uint64_t aggregate_size; + uint64_t page_size; uint64_t window_size; gf_boolean_t flush_behind; gf_boolean_t trickling_writes; @@ -1207,18 +1208,21 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req) char *ptr = NULL; struct iobuf *iobuf = NULL; struct iobref *iobref = NULL; + struct wb_conf *conf = NULL; int ret = -1; ssize_t required_size = 0; size_t holder_len = 0; size_t req_len = 0; + conf = req->wb_inode->this->private; + if (!holder->iobref) { holder_len = iov_length (holder->stub->args.vector, holder->stub->args.count); req_len = iov_length (req->stub->args.vector, req->stub->args.count); - required_size = max ((THIS->ctx->page_size), + required_size = max ((conf->page_size), (holder_len + req_len)); iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool, required_size); @@ -1281,7 +1285,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) wb_request_t *holder = NULL; wb_conf_t *conf = NULL; int ret = 0; - ssize_t page_size = 0; /* With asynchronous IO from a VM guest (as a file), there can be two sequential writes happening in two regions @@ -1292,7 +1295,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) through the interleaved ops */ - page_size = wb_inode->this->ctx->page_size; conf = wb_inode->this->private; list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { @@ -1343,7 +1345,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) continue; } - space_left = page_size - holder->write_size; + space_left = 
wb_inode->window_conf - holder->write_size; if (space_left < req->write_size) { holder->ordering.go = 1; @@ -2471,6 +2473,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out); + GF_OPTION_RECONF ("cache-size", conf->page_size, options, size_uint64, + out); + GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, out); @@ -2522,6 +2527,7 @@ init (xlator_t *this) /* configure 'option window-size <size>' */ GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out); + GF_OPTION_INIT ("cache-size", conf->page_size, size_uint64, out); if (!conf->window_size && conf->aggregate_size) { gf_msg (this->name, GF_LOG_WARNING, 0, diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index dc6e244e717..7732a9711ae 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -15,6 +15,7 @@ #include "glusterfs.h" #include "statedump.h" #include "compat-errno.h" +#include "latency.h" #include "glusterfs3.h" #include "portmap-xdr.h" @@ -1549,7 +1550,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi rpc_clnt_reconfig (conf->rpc, &config); conf->skip_notify = 1; - conf->quick_reconnect = 1; + conf->quick_reconnect = 1; out: if (frame) diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 3cb5e231fbe..3e18b4870ae 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -467,7 +467,7 @@ int32_t client_forget (xlator_t *this, inode_t *inode) { /* Nothing here */ - return 0; + return 0; } int32_t @@ -545,7 +545,7 @@ out: STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -571,7 +571,7 @@ out: if (ret) STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -600,7 +600,7 @@ out: 
STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -628,7 +628,7 @@ out: if (ret) STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -657,7 +657,7 @@ out: if (ret) STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -687,7 +687,7 @@ out: if (ret) STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -718,7 +718,7 @@ out: STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -748,7 +748,7 @@ out: STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -778,7 +778,7 @@ out: STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -807,7 +807,7 @@ out: STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -837,7 +837,7 @@ out: STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -867,7 +867,7 @@ out: STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -897,7 +897,7 @@ out: STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -932,7 +932,7 @@ out: STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -965,7 +965,7 @@ out: if (ret) STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1000,7 +1000,7 @@ out: STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; } @@ -1038,7 +1038,7 @@ out: if (ret) STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1064,7 +1064,7 @@ out: if (ret) STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL); 
- return 0; + return 0; } @@ -1093,7 +1093,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1120,7 +1120,7 @@ out: if (ret) STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1149,7 +1149,7 @@ out: if (ret) STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1177,7 +1177,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1204,7 +1204,7 @@ out: if (ret) STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } static gf_boolean_t @@ -1393,7 +1393,7 @@ out: if (need_unwind) STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); - return 0; + return 0; } @@ -1423,7 +1423,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1453,7 +1453,7 @@ out: if (ret) STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1482,7 +1482,7 @@ out: if (ret) STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1512,7 +1512,7 @@ out: if (ret) STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1542,7 +1542,7 @@ out: if (ret) STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1571,7 +1571,7 @@ out: if (ret) STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1598,7 +1598,7 @@ out: if (ret) STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1654,7 +1654,7 @@ out: if (ret) STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1684,7 +1684,7 @@ out: if (ret) STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1715,7 +1715,7 @@ out: if (ret) STACK_UNWIND_STRICT 
(finodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1747,7 +1747,7 @@ out: if (ret) STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1780,7 +1780,7 @@ out: if (ret) STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1809,7 +1809,7 @@ out: if (ret) STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL); - return 0; + return 0; } int32_t @@ -1840,7 +1840,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1872,7 +1872,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1901,7 +1901,7 @@ out: if (ret) STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -1929,7 +1929,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -2155,7 +2155,7 @@ out: if (ret) STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL); - return 0; + return 0; } @@ -2227,6 +2227,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf = this->private; switch (event) { + case RPC_CLNT_PING: + { + ret = default_notify (this, GF_EVENT_CHILD_PING, NULL); + if (ret) + gf_log (this->name, GF_LOG_INFO, + "CHILD_PING notify failed"); + conf->last_sent_event = GF_EVENT_CHILD_PING; + break; + } case RPC_CLNT_CONNECT: { conf->connected = 1; @@ -2312,13 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf->connected = 0; conf->skip_notify = 0; - if (conf->quick_reconnect) { - conf->quick_reconnect = 0; - rpc_clnt_start (rpc); - - } else { + if (conf->rpc->conn.connected) { + /* Having conf->connected false and + * conf->rpc->conn.connected true is an + * unrecoverable state, since rpc_clnt_reconnect + * will do nothing for an already connected connection. 
+ * A good fix would be to ensure serialized + * delivery of transport messages, but that is super hard + * and this is rare. So... ghetto "fix", disconnect the + * RPC and start the race again. Maybe we'll win + * next time! + */ + gf_log (this->name, GF_LOG_WARNING, + "Client %s reconnect race detected, " + "restarting.", conf->rpc->conn.name); + conf->quick_reconnect = 1; + rpc_transport_disconnect (rpc->conn.trans); rpc->conn.config.remote_port = 0; - + } else { + if (conf->quick_reconnect) { + conf->quick_reconnect = 0; + rpc_clnt_start (rpc); + } else { + rpc->conn.config.remote_port = 0; + } } break; @@ -2670,7 +2696,7 @@ reconfigure (xlator_t *this, dict_t *options) ret = 0; out: - return ret; + return ret; } @@ -2724,6 +2750,8 @@ init (xlator_t *this) this->private = conf; + this->client_latency.min = UINT64_MAX; + /* If it returns -1, then its a failure, if it returns +1 we need have to understand that 'this' is subvolume of a xlator which, will set the remote host and remote subvolume in a setxattr @@ -3001,7 +3029,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_TIME, .min = 0, .max = 1013, - .default_value = "42", + .default_value = "180", .description = "Time duration for which the client waits to " "check if the server is responsive." 
}, diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index d8ef5f7b73f..636108affbb 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (!posix_write_ok (this, priv)) { + op_errno = ENOSPC; + goto err; + } + ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index cecf5dcb66d..c40a087ec46 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -658,6 +658,81 @@ out: return 0; } +static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats, + double min_free_disk, + gf_boolean_t previously_ok) +{ + gf_boolean_t currently_ok; + + if (min_free_disk < 100.0) { + double free_percent = 100.0 * stats->f_bavail / stats->f_blocks; + + currently_ok = + free_percent >= min_free_disk ? _gf_true : _gf_false; + if (previously_ok && !currently_ok) { + gf_log (this->name, GF_LOG_WARNING, + "min-free-disk limit exceeded: free percent " + "%f%% < %f%%. Writes disabled.", + free_percent, min_free_disk); + } + } else { + double free_bytes = stats->f_bavail * stats->f_frsize; + + currently_ok = + free_bytes >= min_free_disk ? _gf_true : _gf_false; + if (previously_ok && !currently_ok) { + gf_log (this->name, GF_LOG_WARNING, + "min-free-disk limit exceeded: free bytes %f " + "< %f. Writes disabled.", + free_bytes, min_free_disk); + } + } + + if (currently_ok && !previously_ok) { + gf_log (this->name, GF_LOG_INFO, "Free space has risen above " + "min-free-disk limit, writes " + "re-enabled."); + } + + return currently_ok; +} + +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv) +{ + /* Check if there is sufficient free space to allow writes. 
+ * + * This is called in the write path, so performance matters. We + * periodically sample free space by calling statvfs(). + * freespace_check_lock is used to ensure only one process at a + * time makes the call; if the lock is contended, the previous + * status (reflected in freespace_check_passed) is used while + * the process that holds the mutex updates the current status. + */ + if (!priv->freespace_check_interval) { + return _gf_true; + } + + if (!pthread_mutex_trylock (&priv->freespace_check_lock)) { + struct timespec now; + + clock_gettime (CLOCK_MONOTONIC, &now); + if (now.tv_sec >= priv->freespace_check_last.tv_sec + + priv->freespace_check_interval) { + sys_statvfs (priv->base_path, &priv->freespace_stats); + priv->freespace_check_last.tv_sec = now.tv_sec; + + priv->freespace_check_passed = freespace_ok ( + this, &priv->freespace_stats, priv->min_free_disk, + priv->freespace_check_passed); + } + + pthread_mutex_unlock (&priv->freespace_check_lock); + } + + return priv->freespace_check_passed; +} + static int32_t posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, off_t offset, size_t len, @@ -667,6 +742,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t op_errno = 0; struct posix_fd *pfd = NULL; gf_boolean_t locked = _gf_false; + struct posix_private *priv = this->private; DECLARE_OLD_FS_ID_VAR; @@ -675,6 +751,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + if (!posix_write_ok (this, priv)) { + ret = -ENOSPC; + goto out; + } ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { @@ -3307,6 +3389,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (priv, out); + if (!posix_write_ok (this, priv)) { + op_errno = ENOSPC; + op_ret = -1; + goto out; + } + ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if 
(ret < 0) { gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, @@ -6671,6 +6759,16 @@ struct posix_private *priv = NULL; options, uint32, out); posix_spawn_health_check_thread (this); + pthread_mutex_lock (&priv->freespace_check_lock); + { + GF_OPTION_RECONF ("freespace-check-interval", + priv->freespace_check_interval, + options, uint32, out); + GF_OPTION_RECONF ("min-free-disk", priv->min_free_disk, options, + percent_or_size, out); + } + pthread_mutex_unlock (&priv->freespace_check_lock); + ret = 0; out: return ret; @@ -7285,6 +7383,19 @@ init (xlator_t *this) GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, uint32, out); + + GF_OPTION_INIT ("freespace-check-interval", + _private->freespace_check_interval, uint32, out); + + GF_OPTION_INIT ("min-free-disk", _private->min_free_disk, + percent_or_size, out); + + pthread_mutex_init (&_private->freespace_check_lock, NULL); + sys_statvfs (_private->base_path, &_private->freespace_stats); + clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last); + _private->freespace_check_passed = freespace_ok ( + this, &_private->freespace_stats, _private->min_free_disk, + _gf_true); out: return ret; } @@ -7462,5 +7573,22 @@ struct volume_options options[] = { "\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n" }, #endif + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "2%", + .description = "Minimum percentage/size of disk space, after which we" + "start failing writes with ENOSPC." + }, + { + .key = {"freespace-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "5", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds between freespace measurements " + "used for the min-free-disk determination. " + "Set to 0 to disable." 
+ }, + { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 87f91e57747..ef4bc66ecbc 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -174,7 +174,14 @@ struct posix_private { XATTR_BOTH, } xattr_user_namespace; #endif - + /* freespace_check_lock protects access to following three fields. */ + pthread_mutex_t freespace_check_lock; + struct timespec freespace_check_last; + struct statvfs freespace_stats; + double min_free_disk; + /* mutex protection ends. */ + uint32_t freespace_check_interval; + gf_boolean_t freespace_check_passed; }; typedef struct { @@ -263,6 +270,9 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode, void posix_gfid_unset (xlator_t *this, dict_t *xdata); +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv); + int posix_pacl_set (const char *path, const char *key, const char *acl_s); |