summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--api/src/glfs-mgmt.c3
-rw-r--r--cli/src/cli.c8
-rw-r--r--configure.ac34
-rw-r--r--glusterfs.spec.in13
-rw-r--r--glusterfsd/src/glusterfsd-mgmt.c65
-rw-r--r--glusterfsd/src/glusterfsd.c10
-rw-r--r--glusterfsd/src/glusterfsd.h2
-rw-r--r--libglusterfs/src/common-utils.c28
-rw-r--r--libglusterfs/src/compat.h8
-rw-r--r--libglusterfs/src/dict.c95
-rw-r--r--libglusterfs/src/dict.h9
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--libglusterfs/src/iobuf.c4
-rw-r--r--libglusterfs/src/latency.c12
-rw-r--r--libglusterfs/src/mem-pool.c10
-rw-r--r--libglusterfs/src/mem-types.h1
-rw-r--r--libglusterfs/src/timespec.c12
-rw-r--r--libglusterfs/src/timespec.h3
-rw-r--r--libglusterfs/src/xlator.c16
-rw-r--r--libglusterfs/src/xlator.h1
-rwxr-xr-xrfc.sh2
-rw-r--r--rpc/rpc-lib/src/rpc-clnt-ping.c87
-rw-r--r--rpc/rpc-lib/src/rpc-clnt.h1
-rw-r--r--rpc/rpc-lib/src/rpc-transport.c44
-rw-r--r--rpc/rpc-lib/src/rpc-transport.h3
-rw-r--r--rpc/rpc-lib/src/rpcsvc.c89
-rw-r--r--rpc/rpc-lib/src/rpcsvc.h5
-rw-r--r--rpc/rpc-transport/rdma/src/name.c5
-rw-r--r--rpc/rpc-transport/socket/src/name.c18
-rw-r--r--rpc/rpc-transport/socket/src/socket.c17
-rw-r--r--rpc/xdr/src/glusterfs-fops.x1
-rwxr-xr-xrun-tests.sh6
-rw-r--r--tests/basic/accept-v6v4.t122
-rwxr-xr-xtests/basic/dht-min-free-space.t78
-rw-r--r--tests/basic/ec/ec-common2
-rw-r--r--tests/basic/ec/self-heal.t2
-rw-r--r--tests/basic/exports_parsing.t15
-rw-r--r--tests/basic/fop-sampling.t78
-rwxr-xr-xtests/basic/fops-sanity-gfproxy.t32
-rw-r--r--tests/basic/gfproxy.t74
-rw-r--r--tests/basic/glusterd/volfile_server_switch.t3
-rw-r--r--tests/basic/halo-failover-disabled.t77
-rw-r--r--tests/basic/halo-failover-enabled.t87
-rw-r--r--tests/basic/halo-hybrid.t70
-rw-r--r--tests/basic/halo.t51
-rwxr-xr-xtests/basic/mount-nfs-auth.t12
-rw-r--r--tests/basic/write-behind.t53
-rw-r--r--tests/bugs/distribute/bug-1099890.t2
-rwxr-xr-xtests/bugs/distribute/bug-1161311.t10
-rw-r--r--tests/bugs/fuse/bug-858488-min-free-disk.t1
-rw-r--r--tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t22
-rwxr-xr-xtests/bugs/glusterd/bug-859927.t8
-rw-r--r--tests/cluster.rc9
-rw-r--r--tests/configfiles/exports-v61
-rw-r--r--tests/env.rc.in3
-rwxr-xr-xtests/features/brick-min-free-space.t113
-rw-r--r--tests/features/lock_revocation.t52
-rw-r--r--tests/halo.rc52
-rw-r--r--tests/include.rc3
-rw-r--r--xlators/cluster/Makefile.am2
-rw-r--r--xlators/cluster/afr/src/afr-common.c734
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h3
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c4
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h1
-rw-r--r--xlators/cluster/afr/src/afr.c164
-rw-r--r--xlators/cluster/afr/src/afr.h30
-rw-r--r--xlators/cluster/aha/Makefile.am3
-rw-r--r--xlators/cluster/aha/src/Makefile.am18
-rw-r--r--xlators/cluster/aha/src/aha-fops.c952
-rw-r--r--xlators/cluster/aha/src/aha-fops.h360
-rw-r--r--xlators/cluster/aha/src/aha-helpers.c46
-rw-r--r--xlators/cluster/aha/src/aha-helpers.h23
-rw-r--r--xlators/cluster/aha/src/aha-mem-types.h22
-rw-r--r--xlators/cluster/aha/src/aha-retry.c524
-rw-r--r--xlators/cluster/aha/src/aha-retry.h12
-rw-r--r--xlators/cluster/aha/src/aha.c345
-rw-r--r--xlators/cluster/aha/src/aha.h46
-rw-r--r--xlators/cluster/dht/src/dht-common.c41
-rw-r--r--xlators/cluster/dht/src/dht-common.h6
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c53
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c10
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c13
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c2
-rw-r--r--xlators/cluster/dht/src/dht-shared.c32
-rw-r--r--xlators/cluster/dht/src/nufa.c10
-rw-r--r--xlators/cluster/dht/src/switch.c10
-rw-r--r--xlators/debug/io-stats/src/io-stats.c678
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.c1
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.c2
-rw-r--r--xlators/features/locks/src/clear.c4
-rw-r--r--xlators/features/locks/src/common.c13
-rw-r--r--xlators/features/locks/src/common.h3
-rw-r--r--xlators/features/locks/src/entrylk.c124
-rw-r--r--xlators/features/locks/src/inodelk.c125
-rw-r--r--xlators/features/locks/src/locks.h4
-rw-r--r--xlators/features/locks/src/posix.c56
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mgmt.c2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c14
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c44
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c39
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h8
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c316
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h3
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c110
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h5
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in31
-rw-r--r--xlators/nfs/server/src/exports.h2
-rw-r--r--xlators/nfs/server/src/mount3.c11
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c10
-rw-r--r--xlators/nfs/server/src/nfs-common.c6
-rw-r--r--xlators/nfs/server/src/nfs.c26
-rw-r--r--xlators/nfs/server/src/nfs3.c49
-rw-r--r--xlators/performance/io-threads/src/io-threads.c46
-rw-r--r--xlators/performance/io-threads/src/io-threads.h5
-rw-r--r--xlators/performance/md-cache/src/md-cache.c51
-rw-r--r--xlators/performance/write-behind/src/write-behind.c14
-rw-r--r--xlators/protocol/client/src/client-handshake.c3
-rw-r--r--xlators/protocol/client/src/client.c130
-rw-r--r--xlators/storage/posix/src/posix-aio.c5
-rw-r--r--xlators/storage/posix/src/posix.c128
-rw-r--r--xlators/storage/posix/src/posix.h12
122 files changed, 6687 insertions, 514 deletions
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c
index 8c9872cfa53..5d08114c8c5 100644
--- a/api/src/glfs-mgmt.c
+++ b/api/src/glfs-mgmt.c
@@ -911,7 +911,8 @@ glfs_mgmt_init (struct glfs *fs)
if (!strcmp (cmd_args->volfile_server_transport, "unix")) {
ret = rpc_transport_unix_options_build (&options, host, 0);
} else {
- ret = rpc_transport_inet_options_build (&options, host, port);
+ ret = rpc_transport_inet_options_build (&options, host, port,
+ NULL);
}
if (ret)
diff --git a/cli/src/cli.c b/cli/src/cli.c
index 2ecaae415d6..fa507309e80 100644
--- a/cli/src/cli.c
+++ b/cli/src/cli.c
@@ -586,6 +586,11 @@ cli_rpc_init (struct cli_state *state)
int ret = -1;
int port = CLI_GLUSTERD_PORT;
xlator_t *this = NULL;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+#else
+ char *addr_family = "inet";
+#endif
this = THIS;
cli_rpc_prog = &cli_prog;
@@ -621,7 +626,8 @@ cli_rpc_init (struct cli_state *state)
goto out;
ret = dict_set_str (options, "transport.address-family",
- "inet");
+ addr_family);
+
if (ret)
goto out;
}
diff --git a/configure.ac b/configure.ac
index cfefa218156..86c6bcfcc4d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -72,6 +72,8 @@ AC_CONFIG_FILES([Makefile
xlators/cluster/Makefile
xlators/cluster/afr/Makefile
xlators/cluster/afr/src/Makefile
+ xlators/cluster/aha/Makefile
+ xlators/cluster/aha/src/Makefile
xlators/cluster/stripe/Makefile
xlators/cluster/stripe/src/Makefile
xlators/cluster/dht/Makefile
@@ -275,7 +277,19 @@ if test "x$enable_debug" = "xyes"; then
CFLAGS="${CFLAGS} -g -O0 -DDEBUG"
else
BUILD_DEBUG=no
- CFLAGS="${CFLAGS} -g -O2"
+ CFLAGS="${CFLAGS} -g"
+fi
+
+dnl --with-fbextras: records BUILD_FBEXTRAS=yes/no for later tests in this
+dnl script.
+AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.]))
+if test "x$with_fbextras" = "xyes"; then
+        BUILD_FBEXTRAS=yes
+else
+        BUILD_FBEXTRAS=no
+fi
+
+dnl --disable-privport_prefer: appends -DNO_PRIVPORT to CFLAGS. The option is
+dnl only acted upon when explicitly disabled.
+AC_ARG_ENABLE([privport_prefer], AC_HELP_STRING([--disable-privport_prefer], [Disable preferred usage of privileged ports.]))
+if test "x$enable_privport_prefer" = "xno"; then
+        CFLAGS="${CFLAGS} -DNO_PRIVPORT"
 fi
case $host_os in
@@ -908,6 +922,16 @@ AC_SUBST(GF_DISTRIBUTION)
GF_HOST_OS=""
GF_LDFLAGS="-rdynamic"
+TESTER_CFLAGS=""
+
+dnl include tirpc for FB builds
+if test "x$BUILD_FBEXTRAS" = "xyes"; then
+ TIRPC_CFLAGS="-I/usr/include/tirpc"
+ GF_LDFLAGS="-lfbtirpc $GF_LDFLAGS"
+ GF_CFLAGS="$GF_CFLAGS $TIRPC_CFLAGS -DIPV6_DEFAULT"
+ TESTER_CFLAGS="$TESTER_CFLAGS -lfbtirpc"
+fi
+
dnl check for gcc -Werror=format-security
saved_CFLAGS=$CFLAGS
CFLAGS="-Wformat -Werror=format-security"
@@ -1099,6 +1123,12 @@ AC_ARG_ENABLE([debug],
AC_HELP_STRING([--enable-debug],
[Enable debug build options.]))
+AC_ARG_ENABLE([mempool],
+ AC_HELP_STRING([--disable-mempool],
+ [Disable the Gluster memory pooler.]))
+if test "x$enable_mempool" = "xno"; then
+ CFLAGS="${CFLAGS} -DDISABLE_MEMPOOL"
+fi
# syslog section
AC_ARG_ENABLE([syslog],
@@ -1294,12 +1324,14 @@ AC_SUBST([GF_CPPFLAGS])
AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS")
AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS")
AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS")
+AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes")
AC_SUBST(GLUSTERD_WORKDIR)
AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! -d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd )
AC_SUBST(GLUSTERD_VOLFILE)
AC_SUBST(GLUSTERFS_LIBEXECDIR)
AC_SUBST(GLUSTERFSD_MISCDIR)
+AC_SUBST(TESTER_CFLAGS)
dnl pkg-config versioning
dnl
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
index b28bb426555..29bf00c60a9 100644
--- a/glusterfs.spec.in
+++ b/glusterfs.spec.in
@@ -13,6 +13,10 @@
# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with debug
%{?_with_debug:%global _with_debug --enable-debug}
+# if you wish to compile an rpm with Facebook specific extras...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras
+%{?_with_fbextras:%global _with_fbextras --with-fbextras}
+
# if you wish to compile an rpm with cmocka unit testing...
# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka
%{?_with_cmocka:%global _with_cmocka --enable-cmocka}
@@ -196,6 +200,9 @@ BuildRequires: libxml2-devel openssl-devel
BuildRequires: libaio-devel libacl-devel
BuildRequires: python-devel
BuildRequires: python-ctypes
+%if ( 0%{?_with_fbextras:1} )
+BuildRequires: fb-libtirpc fb-libtirpc-devel
+%endif
BuildRequires: userspace-rcu-devel >= 0.7
%if ( 0%{?rhel} && 0%{?rhel} <= 6 )
BuildRequires: automake
@@ -513,6 +520,9 @@ Requires: %{name}-cli%{?_isa} = %{version}-%{release}
Requires: %{name}-libs%{?_isa} = %{version}-%{release}
# some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse
Requires: %{name}-fuse%{?_isa} = %{version}-%{release}
+%if ( 0%{?_with_fbextras:1} )
+Requires: fb-libtirpc >= 0.2.5-1
+%endif
# self-heal daemon, rebalance, nfs-server etc. are actually clients
Requires: %{name}-api%{?_isa} = %{version}-%{release}
Requires: %{name}-client-xlators%{?_isa} = %{version}-%{release}
@@ -596,7 +606,8 @@ export CFLAGS
%{?_without_ocf} \
%{?_without_rdma} \
%{?_without_syslog} \
- %{?_without_tiering}
+ %{?_without_tiering} \
+ %{?_with_fbextras}
# fix hardening and remove rpath in shlibs
%if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 )
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index c47fa3883c9..556b82742cb 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -1903,9 +1903,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
switch (event) {
case RPC_CLNT_DISCONNECT:
- GF_LOG_OCCASIONALLY (log_ctr1, "glusterfsd-mgmt", GF_LOG_ERROR,
- "failed to connect with remote-host: %s (%s)",
- ctx->cmd_args.volfile_server, strerror (errno));
+ ctx->cmd_args.connect_attempts++;
+
+ gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
+ "Connect attempt with remote-host: %s (%s) (%u/%d)",
+ ctx->cmd_args.volfile_server,
+ strerror (errno),
+ ctx->cmd_args.connect_attempts,
+ ctx->cmd_args.max_connect_attempts);
if (!rpc->disabled) {
/*
* Check if dnscache is exhausted for current server
@@ -1916,8 +1921,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
break;
}
}
+
+ /* If we run out of servers, AND we attempted to connect
+ * max connect times, then we should return ENOTCONN
+ */
server = ctx->cmd_args.curr_server;
- if (server->list.next == &ctx->cmd_args.volfile_servers) {
+ if ((ctx->cmd_args.connect_attempts >=
+ ctx->cmd_args.max_connect_attempts) &&
+ server->list.next == &ctx->cmd_args.volfile_servers) {
if (!ctx->active)
need_term = 1;
emval = ENOTCONN;
@@ -1926,24 +1937,33 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
"Exhausted all volfile servers");
break;
}
- server = list_entry (server->list.next, typeof(*server), list);
- ctx->cmd_args.curr_server = server;
- ctx->cmd_args.volfile_server = server->volfile_server;
-
- ret = dict_set_str (rpc_trans->options, "remote-host",
- server->volfile_server);
- if (ret != 0) {
- gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
- "failed to set remote-host: %s",
+
+ /* If we exceed the # of connect attempts, we should
+ * move onto the next server
+ */
+ if (ctx->cmd_args.connect_attempts >=
+ ctx->cmd_args.max_connect_attempts || !server) {
+ server = list_entry (server->list.next,
+ typeof(*server), list);
+ ctx->cmd_args.curr_server = server;
+ ctx->cmd_args.volfile_server = server->volfile_server;
+
+ ret = dict_set_str (rpc_trans->options, "remote-host",
+ server->volfile_server);
+ if (ret != 0) {
+ gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
+ "failed to set remote-host: %s",
+ server->volfile_server);
+ if (!ctx->active)
+ need_term = 1;
+ emval = ENOTCONN;
+ break;
+ }
+ ctx->cmd_args.connect_attempts = 0;
+ gf_log ("glusterfsd-mgmt", GF_LOG_INFO,
+ "connecting to next volfile server %s",
server->volfile_server);
- if (!ctx->active)
- need_term = 1;
- emval = ENOTCONN;
- break;
}
- gf_log ("glusterfsd-mgmt", GF_LOG_INFO,
- "connecting to next volfile server %s",
- server->volfile_server);
break;
case RPC_CLNT_CONNECT:
rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn);
@@ -1960,7 +1980,7 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
}
}
-
+ ctx->cmd_args.connect_attempts = 0;
if (is_mgmt_rpc_reconnect)
glusterfs_mgmt_pmap_signin (ctx);
@@ -2136,7 +2156,8 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx)
!strcmp (cmd_args->volfile_server_transport, "unix")) {
ret = rpc_transport_unix_options_build (&options, host, 0);
} else {
- ret = rpc_transport_inet_options_build (&options, host, port);
+ ret = rpc_transport_inet_options_build (&options, host, port,
+ NULL);
}
if (ret)
goto out;
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index 6c7a7c883fa..5022cfc22da 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -986,7 +986,7 @@ parse_opts (int key, char *arg, struct argp_state *state)
cmd_args->debug_mode = ENABLE_DEBUG_MODE;
break;
case ARGP_VOLFILE_MAX_FETCH_ATTEMPTS:
- cmd_args->max_connect_attempts = 1;
+ cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;
break;
case ARGP_DIRECT_IO_MODE_KEY:
@@ -1955,13 +1955,7 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)
}
}
- /*
- This option was made obsolete but parsing it for backward
- compatibility with third party applications
- */
- if (cmd_args->max_connect_attempts) {
- gf_msg ("glusterfs", GF_LOG_WARNING, 0, glusterfsd_msg_33);
- }
+ cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;
#ifdef GF_DARWIN_HOST_OS
if (cmd_args->mount_point)
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
index e442bede5db..b5c6b27b534 100644
--- a/glusterfsd/src/glusterfsd.h
+++ b/glusterfsd/src/glusterfsd.h
@@ -16,7 +16,7 @@
#define DEFAULT_GLUSTERD_VOLFILE CONFDIR "/glusterd.vol"
#define DEFAULT_CLIENT_VOLFILE CONFDIR "/glusterfs.vol"
#define DEFAULT_SERVER_VOLFILE CONFDIR "/glusterfsd.vol"
-
+#define DEFAULT_MAX_CONNECT_ATTEMPTS 200
#define DEFAULT_EVENT_POOL_SIZE 16384
#define ARGP_LOG_LEVEL_NONE_OPTION "NONE"
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 18f445ae265..6a5889207d4 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -181,26 +181,16 @@ gf_rev_dns_lookup (const char *ip)
{
char *fqdn = NULL;
int ret = 0;
- struct sockaddr_in sa = {0};
- char host_addr[256] = {0, };
GF_VALIDATE_OR_GOTO ("resolver", ip, out);
- sa.sin_family = AF_INET;
- inet_pton (AF_INET, ip, &sa.sin_addr);
- ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr,
- sizeof (host_addr), NULL, 0, 0);
-
+ /* Get the FQDN */
+ ret = gf_get_hostname_from_ip ((char *)ip, &fqdn);
if (ret != 0) {
gf_msg ("resolver", GF_LOG_INFO, errno,
LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve "
"hostname for %s", ip);
- goto out;
}
-
- /* Get the FQDN */
- fqdn = gf_strdup (host_addr);
-
out:
return fqdn;
}
@@ -3107,11 +3097,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
char *client_ip_copy = NULL;
char *tmp = NULL;
char *ip = NULL;
+ size_t addr_sz = 0;
/* if ipv4, reverse lookup the hostname to
* allow FQDN based rpc authentication
*/
- if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) {
+ if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) &&
+ !valid_ipv4_address (client_ip, strlen (client_ip), 0)) {
/* most times, we get a.b.c.d:port form, so check that */
client_ip_copy = gf_strdup (client_ip);
if (!client_ip_copy)
@@ -3124,12 +3116,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) {
client_sockaddr = (struct sockaddr *)&client_sock_in;
+ addr_sz = sizeof (client_sock_in);
client_sock_in.sin_family = AF_INET;
ret = inet_pton (AF_INET, ip,
(void *)&client_sock_in.sin_addr.s_addr);
} else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) {
client_sockaddr = (struct sockaddr *) &client_sock_in6;
+ addr_sz = sizeof (client_sock_in6);
client_sock_in6.sin6_family = AF_INET6;
ret = inet_pton (AF_INET6, ip,
@@ -3143,8 +3137,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
goto out;
}
+ /* Do not pass sizeof (*client_sockaddr) here: per the getnameinfo
+ * man page, the length argument must be the size of the underlying
+ * sockaddr struct (e.g. sockaddr_in6 or sockaddr_in). Passing the
+ * generic size breaks IPv6 hostname resolution (IPv4 works only
+ * because the sockaddr_in struct happens to be of the correct size).
+ */
ret = getnameinfo (client_sockaddr,
- sizeof (*client_sockaddr),
+ addr_sz,
client_hostname, sizeof (client_hostname),
NULL, 0, 0);
if (ret) {
diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h
index ea722028eb5..56736e52052 100644
--- a/libglusterfs/src/compat.h
+++ b/libglusterfs/src/compat.h
@@ -467,6 +467,12 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);
#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0);
#endif
+#ifdef GF_BSD_HOST_OS
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+
+#ifndef IPV6_DEFAULT
+
#ifndef IXDR_GET_LONG
#define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf))
#endif
@@ -483,6 +489,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);
#define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG(buf, (long)(v))
#endif
+#endif /* IPV6_DEFAULT */
+
#if defined(__GNUC__) && !defined(RELAX_POISONING)
/* Use run API, see run.h */
#include <stdlib.h> /* system(), mkostemp() */
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c
index 25ddff0d8c4..6a61e641e19 100644
--- a/libglusterfs/src/dict.c
+++ b/libglusterfs/src/dict.c
@@ -27,6 +27,45 @@
#include "statedump.h"
#include "libglusterfs-messages.h"
+/* Number of entries in the bucket_sizes[] lookup table below: one entry per
+ * possible position of the highest set bit in a 32-bit value. */
+#define NUM_DISTINCT_SIZES_32_BIT 32
+
+/* Bucket-count lookup table, borrowed from GNU libstdc++.
+ *
+ * dict_new_by_size() indexes it with (position of the highest set bit of the
+ * requested size) - 1 and passes the selected value to get_new_dict_full()
+ * as the size hint. Each entry is roughly double the previous one; the last
+ * three entries all hold 4294967291, i.e. the table tops out just below
+ * 2^32. The numeric comment on each row is its array index. */
+static const uint32_t bucket_sizes[NUM_DISTINCT_SIZES_32_BIT] = {
+ /* 0 */ 5ul,
+ /* 1 */ 11ul,
+ /* 2 */ 23ul,
+ /* 3 */ 47ul,
+ /* 4 */ 97ul,
+ /* 5 */ 199ul,
+ /* 6 */ 409ul,
+ /* 7 */ 823ul,
+ /* 8 */ 1741ul,
+ /* 9 */ 3469ul,
+ /* 10 */ 6949ul,
+ /* 11 */ 14033ul,
+ /* 12 */ 28411ul,
+ /* 13 */ 57557ul,
+ /* 14 */ 116731ul,
+ /* 15 */ 236897ul,
+ /* 16 */ 480881ul,
+ /* 17 */ 976369ul,
+ /* 18 */ 1982627ul,
+ /* 19 */ 4026031ul,
+ /* 20 */ 8175383ul,
+ /* 21 */ 16601593ul,
+ /* 22 */ 33712729ul,
+ /* 23 */ 68460391ul,
+ /* 24 */ 139022417ul,
+ /* 25 */ 282312799ul,
+ /* 26 */ 573292817ul,
+ /* 27 */ 1164186217ul,
+ /* 28 */ 2364114217ul,
+ /* 29 */ 4294967291ul,
+ /* 30 */ 4294967291ul,
+ /* 31 */ 4294967291ul,
+};
+
struct dict_cmp {
dict_t *dict;
gf_boolean_t (*value_ignore) (char *k);
@@ -47,7 +86,7 @@ get_new_data ()
}
dict_t *
-get_new_dict_full (int size_hint)
+get_new_dict_full (uint32_t size_hint)
{
dict_t *dict = mem_get0 (THIS->ctx->dict_pool);
@@ -67,17 +106,8 @@ get_new_dict_full (int size_hint)
dict->members = &dict->members_internal;
}
else {
- /*
- * We actually need to allocate space for size_hint *pointers*
- * but we actually allocate space for one *structure*. Since
- * a data_pair_t consists of five pointers, we're wasting four
- * pointers' worth for N=1, and will overrun what we allocated
- * for N>5. If anybody ever starts using size_hint, we'll need
- * to fix this.
- */
- GF_ASSERT (size_hint <=
- (sizeof(data_pair_t) / sizeof(data_pair_t *)));
- dict->members = mem_get0 (THIS->ctx->dict_pair_pool);
+ dict->members = GF_CALLOC (size_hint, sizeof (data_pair_t *),
+ gf_common_mt_data_pair_t);
if (!dict->members) {
mem_put (dict);
return NULL;
@@ -108,6 +138,35 @@ dict_new (void)
return dict;
}
+/*
+ * dict_new_by_size - create a referenced dict whose hash table is sized
+ * from the expected number of entries.
+ *
+ * @num: expected number of entries. The position of its highest set bit
+ *       selects a bucket count from bucket_sizes[] (1 -> index 0,
+ *       2..3 -> index 1, 4..7 -> index 2, ...).
+ *
+ * Returns the new dict with a reference taken via dict_ref(), or NULL when
+ * @num is 0 or get_new_dict_full() fails.
+ */
+dict_t *
+dict_new_by_size (uint32_t num)
+{
+        int32_t   highest_bit = 0;
+        uint32_t  bucket_size = 0;
+        dict_t   *dict = NULL;
+
+        /* Also protects the clz path below: __builtin_clz (0) is undefined. */
+        if (num == 0)
+                goto out;
+
+        /* __builtin_clz is a compiler builtin, so select it on the compiler
+         * macro (__GNUC__, which clang defines too) rather than on the
+         * _GNU_SOURCE libc feature-test macro. It counts leading zeros of an
+         * unsigned int, so derive the bit width from that type instead of
+         * assuming 32. */
+#ifdef __GNUC__
+        highest_bit = (int32_t)(sizeof (unsigned int) * 8) -
+                      __builtin_clz (num);
+#else
+        while (num != 0) {
+                highest_bit++;
+                num >>= 1;
+        }
+#endif
+
+        /* num != 0 guarantees 1 <= highest_bit <= 32, i.e. a valid index. */
+        bucket_size = bucket_sizes[highest_bit - 1];
+        dict = get_new_dict_full (bucket_size);
+
+        if (dict)
+                dict_ref (dict);
+
+out:
+        return dict;
+}
+
int32_t
is_data_equal (data_t *one,
data_t *two)
@@ -268,7 +327,7 @@ err_out:
static data_pair_t *
dict_lookup_common (dict_t *this, char *key)
{
- int hashval = 0;
+ uint32_t hashval = 0;
if (!this || !key) {
gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL,
LG_MSG_INVALID_ARG,
@@ -279,7 +338,7 @@ dict_lookup_common (dict_t *this, char *key)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1)
+ if (this->hash_size > 1)
hashval = SuperFastHash (key, strlen (key)) % this->hash_size;
data_pair_t *pair;
@@ -319,7 +378,7 @@ dict_lookup (dict_t *this, char *key, data_t **data)
static int32_t
dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)
{
- int hashval = 0;
+ uint32_t hashval = 0;
data_pair_t *pair;
char key_free = 0;
int tmp = 0;
@@ -336,7 +395,7 @@ dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1) {
+ if (this->hash_size > 1) {
tmp = SuperFastHash (key, strlen (key));
hashval = (tmp % this->hash_size);
}
@@ -478,7 +537,7 @@ dict_get (dict_t *this, char *key)
void
dict_del (dict_t *this, char *key)
{
- int hashval = 0;
+ uint32_t hashval = 0;
if (!this || !key) {
gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL,
@@ -491,7 +550,7 @@ dict_del (dict_t *this, char *key)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1)
+ if (this->hash_size > 1)
hashval = SuperFastHash (key, strlen (key)) % this->hash_size;
data_pair_t *pair = this->members[hashval];
diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h
index c5b82677e2e..1f6c1a0eae9 100644
--- a/libglusterfs/src/dict.h
+++ b/libglusterfs/src/dict.h
@@ -79,9 +79,9 @@ struct _data_pair {
struct _dict {
unsigned char is_static:1;
- int32_t hash_size;
- int32_t count;
- int32_t refcount;
+ uint32_t hash_size;
+ uint32_t count;
+ uint32_t refcount;
data_pair_t **members;
data_pair_t *members_list;
char *extra_free;
@@ -156,7 +156,7 @@ void *data_to_ptr (data_t *data);
data_t *get_new_data ();
data_t * data_copy (data_t *old);
-dict_t *get_new_dict_full (int size_hint);
+dict_t *get_new_dict_full (uint32_t size_hint);
dict_t *get_new_dict ();
int dict_foreach (dict_t *this,
@@ -196,6 +196,7 @@ int dict_keys_join (void *value, int size, dict_t *dict,
/* CLEANED UP FUNCTIONS DECLARATIONS */
GF_MUST_CHECK dict_t *dict_new (void);
+GF_MUST_CHECK dict_t *dict_new_by_size (uint32_t num);
dict_t *dict_copy_with_ref (dict_t *this, dict_t *new);
GF_MUST_CHECK int dict_reset (dict_t *dict);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 6e2d370605b..399d695665b 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -330,6 +330,7 @@ struct _cmd_args {
uint32_t log_buf_size;
uint32_t log_flush_timeout;
int32_t max_connect_attempts;
+ unsigned int connect_attempts;
char *print_exports;
char *print_netgroups;
/* advanced options */
diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c
index 17cd68fc206..fa3ac840c43 100644
--- a/libglusterfs/src/iobuf.c
+++ b/libglusterfs/src/iobuf.c
@@ -30,8 +30,8 @@ struct iobuf_init_config gf_iobuf_init_config[] = {
{8 * 1024, 128},
{32 * 1024, 64},
{128 * 1024, 32},
- {256 * 1024, 8},
- {1 * 1024 * 1024, 2},
+ {256 * 1024, 64},
+ {1 * 1024 * 1024, 64},
};
int
diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c
index 611615949fa..3399cc7c297 100644
--- a/libglusterfs/src/latency.c
+++ b/libglusterfs/src/latency.c
@@ -21,6 +21,7 @@
#include "statedump.h"
#include "libglusterfs-messages.h"
+static int gf_set_fop_from_fn_pointer_warning;
void
gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn)
{
@@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void
fop = GF_FOP_READDIRP;
else if (fops->getspec == *(fop_getspec_t *)&fn)
fop = GF_FOP_GETSPEC;
- else
- fop = -1;
+ else if (fops->ipc == *(fop_ipc_t *)&fn)
+ fop = GF_FOP_IPC;
+ else {
+ fop = GF_FOP_NULL;
+ GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning,
+ "latency",
+ GF_LOG_WARNING,
+ "Unknown FOP type");
+ }
frame->op = fop;
}
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index 88fbdf58319..4d81ade8b60 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -454,6 +454,10 @@ mem_get0 (struct mem_pool *mem_pool)
void *
mem_get (struct mem_pool *mem_pool)
{
+#ifdef DISABLE_MEMPOOL
+ return GF_CALLOC (1, mem_pool->real_sizeof_type,
+ gf_common_mt_mem_pool);
+#else
struct list_head *list = NULL;
void *ptr = NULL;
int *in_use = NULL;
@@ -525,6 +529,7 @@ fwd_addr_out:
UNLOCK (&mem_pool->lock);
return ptr;
+#endif /* DISABLE_MEMPOOL */
}
@@ -551,6 +556,10 @@ __is_member (struct mem_pool *pool, void *ptr)
void
mem_put (void *ptr)
{
+#ifdef DISABLE_MEMPOOL
+ GF_FREE (ptr);
+ return;
+#else
struct list_head *list = NULL;
int *in_use = NULL;
void *head = NULL;
@@ -628,6 +637,7 @@ mem_put (void *ptr)
}
}
UNLOCK (&pool->lock);
+#endif /* DISABLE_MEMPOOL */
}
void
diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h
index afa52d8bc45..fc7bf9e5996 100644
--- a/libglusterfs/src/mem-types.h
+++ b/libglusterfs/src/mem-types.h
@@ -168,6 +168,7 @@ enum gf_common_mem_types_ {
/*lock migration*/
gf_common_mt_lock_mig,
gf_common_mt_pthread_t,
+ gf_common_ping_local_t,
gf_common_mt_end
};
#endif
diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c
index f7b2bea2f30..903303d1380 100644
--- a/libglusterfs/src/timespec.c
+++ b/libglusterfs/src/timespec.c
@@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta)
ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000);
ts->tv_sec += delta.tv_sec;
}
+
+/*
+ * timespec_sub - store (*end - *begin) in *res.
+ *
+ * When end's nanosecond field is smaller than begin's, one second is
+ * borrowed so the resulting tv_nsec is not negative. No other normalisation
+ * of the inputs is performed.
+ */
+void timespec_sub (const struct timespec *begin, const struct timespec *end,
+                   struct timespec *res)
+{
+        /* 1 when a second has to be borrowed for the nanosecond field */
+        const int borrow = (end->tv_nsec < begin->tv_nsec);
+
+        res->tv_sec = end->tv_sec - begin->tv_sec - borrow;
+        res->tv_nsec = borrow
+                ? end->tv_nsec + 1000000000 - begin->tv_nsec
+                : end->tv_nsec - begin->tv_nsec;
+}
diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h
index f37194b97cf..9c393ee7166 100644
--- a/libglusterfs/src/timespec.h
+++ b/libglusterfs/src/timespec.h
@@ -20,5 +20,8 @@
void timespec_now (struct timespec *ts);
void timespec_adjust_delta (struct timespec *ts, struct timespec delta);
+void timespec_sub (const struct timespec *begin,
+ const struct timespec *end,
+ struct timespec *res);
#endif /* __INCLUDE_TIMESPEC_H__ */
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index 3c1cde50fa0..b2529d3c4f7 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -117,6 +117,14 @@ out:
}
+/*
+ * Directory used to build xlator shared-object paths: the value of the
+ * GLUSTER_LIBDIR environment variable when it is set, otherwise the
+ * compiled-in XLATORDIR.
+ */
+static const char *xlator_lib_path (void)
+{
+        const char *override = getenv ("GLUSTER_LIBDIR");
+
+        if (override != NULL)
+                return override;
+
+        return XLATORDIR;
+}
+
+
int
xlator_volopt_dynload (char *xlator_type, void **dl_handle,
volume_opt_list_t *opt_list)
@@ -130,9 +138,11 @@ xlator_volopt_dynload (char *xlator_type, void **dl_handle,
/* socket.so doesn't fall under the default xlator directory, hence we
* need this check */
if (!strstr(xlator_type, "rpc-transport"))
- ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xlator_type);
+ ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (),
+ xlator_type);
else
- ret = gf_asprintf (&name, "%s/%s.so", XLATORPARENTDIR, xlator_type);
+ ret = gf_asprintf (&name, "%s/../%s.so", xlator_lib_path (),
+ xlator_type);
if (-1 == ret) {
goto out;
}
@@ -183,7 +193,7 @@ xlator_dynload (xlator_t *xl)
INIT_LIST_HEAD (&xl->volume_options);
- ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xl->type);
+ ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), xl->type);
if (-1 == ret) {
goto out;
}
diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h
index 70e6f0a108d..2e04893c487 100644
--- a/libglusterfs/src/xlator.h
+++ b/libglusterfs/src/xlator.h
@@ -927,6 +927,7 @@ struct _xlator {
gf_loglevel_t loglevel; /* Log level for translator */
+ fop_latency_t client_latency;
/* for latency measurement */
fop_latency_t latencies[GF_FOP_MAXVALUE];
diff --git a/rfc.sh b/rfc.sh
index eb03843a173..2bcbc11bce4 100755
--- a/rfc.sh
+++ b/rfc.sh
@@ -17,7 +17,7 @@ done
shift $((OPTIND-1))
-branch="release-3.8";
+branch="release-3.8-fb";
set_hooks_commit_msg()
{
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c
index a7ff866ac99..7ce066dec5f 100644
--- a/rpc/rpc-lib/src/rpc-clnt-ping.c
+++ b/rpc/rpc-lib/src/rpc-clnt-ping.c
@@ -18,6 +18,7 @@
#include "mem-pool.h"
#include "xdr-rpc.h"
#include "rpc-common-xdr.h"
+#include "timespec.h"
char *clnt_ping_procs[GF_DUMP_MAXVALUE] = {
@@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = {
.procnames = clnt_ping_procs,
};
+/* Per-ping context carried in the ping frame's ->local.
+ * rpc_clnt_ping_cbk() reads it back from frame->local and releases it with
+ * GF_FREE() before destroying the frame. */
+struct ping_local {
+ /* client the ping belongs to; the callback uses its conn and notifyfn */
+ struct rpc_clnt *rpc;
+ /* start of the latency interval: the callback subtracts this from the
+  * current time. Presumably stamped when the ping is submitted -- the
+  * assignment is outside the visible hunks, TODO confirm. */
+ struct timespec submit_time;
+};
+
/* Must be called under conn->lock */
static int
__rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk)
@@ -166,16 +172,48 @@ out:
return;
}
+/*
+ * Fold one ping latency sample into the owning xlator's client_latency
+ * statistics (min, max, total, count and running mean), then log the sample
+ * at DEBUG level.
+ *
+ * @conn:         connection the ping was sent on; only its transport's peer
+ *                identifier is used, for the log message.
+ * @frame:        ping call frame; frame->this is the xlator whose
+ *                client_latency is updated.
+ * @elapsed_usec: time between ping submission and its callback, in
+ *                microseconds.
+ *
+ * rpc_clnt_ping_cbk() calls this while holding conn->lock.
+ */
+void
+_update_client_latency (const rpc_clnt_connection_t *conn,
+                        call_frame_t *frame,
+                        uint64_t elapsed_usec)
+{
+        fop_latency_t *lat = &frame->this->client_latency;
+
+        /* Seed min from the very first sample. client_latency is a plain
+         * member of the xlator and no sentinel initialisation of min is
+         * visible, so a zeroed min would otherwise never be lowered and
+         * would be reported as 0 forever.
+         * NOTE(review): confirm nothing elsewhere pre-sets min. */
+        if (lat->count == 0 || elapsed_usec < lat->min) {
+                lat->min = elapsed_usec;
+        }
+
+        if (elapsed_usec > lat->max) {
+                lat->max = elapsed_usec;
+        }
+
+        lat->total += elapsed_usec;
+        lat->count++;
+        /* incremental mean: mean += (sample - mean) / n */
+        lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count;
+
+        /* Samples are kept in microseconds; divide by 1000 to log ms.
+         * NOTE(review): count is printed with %ld -- confirm this matches
+         * the type of fop_latency_t.count. */
+        gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, "
+                "avg: %0.6lf ms, count:%ld",
+                conn->trans->peerinfo.identifier, elapsed_usec / 1000.0,
+                lat->mean / 1000.0, lat->count);
+}
+
int
rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
void *myframe)
{
- struct rpc_clnt *rpc = NULL;
+ struct ping_local *local = NULL;
xlator_t *this = NULL;
rpc_clnt_connection_t *conn = NULL;
+
call_frame_t *frame = NULL;
struct timespec timeout = {0, };
+ struct timespec now;
+ struct timespec delta;
+ int64_t latency_usec = 0;
+ int ret = 0;
int unref = 0;
+ gf_boolean_t call_notify = _gf_false;
if (!myframe) {
gf_log (THIS->name, GF_LOG_WARNING,
@@ -185,14 +223,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
frame = myframe;
this = frame->this;
- rpc = frame->local;
- frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */
- conn = &rpc->conn;
+ local = frame->local;
+ conn = &local->rpc->conn;
pthread_mutex_lock (&conn->lock);
{
if (req->rpc_status == -1) {
- unref = rpc_clnt_remove_ping_timer_locked (rpc);
+ unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
if (unref) {
gf_log (this->name, GF_LOG_WARNING,
"socket or ib related error");
@@ -207,8 +244,15 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
goto unlock;
}
- unref = rpc_clnt_remove_ping_timer_locked (rpc);
- if (__rpc_clnt_rearm_ping_timer (rpc,
+ timespec_now (&now);
+ timespec_sub (&local->submit_time, &now, &delta);
+ latency_usec = delta.tv_sec * 1000000UL +
+ delta.tv_nsec / 1000UL;
+
+ _update_client_latency (conn, frame, latency_usec);
+ call_notify = _gf_true;
+ unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
+ if (__rpc_clnt_rearm_ping_timer (local->rpc,
rpc_clnt_start_ping) == -1) {
gf_log (this->name, GF_LOG_WARNING,
"failed to set the ping timer");
@@ -217,12 +261,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
}
unlock:
pthread_mutex_unlock (&conn->lock);
+
+ if (call_notify) {
+ ret = local->rpc->notifyfn (local->rpc, this,
+ RPC_CLNT_PING, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "RPC_CLNT_PING notify failed");
+ }
+ }
out:
if (unref)
- rpc_clnt_unref (rpc);
+ rpc_clnt_unref (local->rpc);
- if (frame)
+ if (frame) {
+ GF_FREE (frame->local);
+ frame->local = NULL;
STACK_DESTROY (frame->root);
+ }
return 0;
}
@@ -232,18 +288,27 @@ rpc_clnt_ping (struct rpc_clnt *rpc)
call_frame_t *frame = NULL;
int32_t ret = -1;
rpc_clnt_connection_t *conn = NULL;
+ struct ping_local *local = NULL;
conn = &rpc->conn;
+ local = GF_MALLOC (sizeof(struct ping_local), gf_common_ping_local_t);
+ if (!local)
+ return ret;
frame = create_frame (THIS, THIS->ctx->pool);
- if (!frame)
+ if (!frame) {
+ GF_FREE (local);
return ret;
+ }
- frame->local = rpc;
+ local->rpc = rpc;
+ timespec_now (&local->submit_time);
+ frame->local = local;
ret = rpc_clnt_submit (rpc, &clnt_ping_prog,
GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0,
NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL);
if (ret) {
+ /* FIXME: should we free the frame here? Methinks so! */
gf_log (THIS->name, GF_LOG_ERROR,
"failed to start ping timer");
}
diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h
index 3a5b287cd49..2ccaa56e4cb 100644
--- a/rpc/rpc-lib/src/rpc-clnt.h
+++ b/rpc/rpc-lib/src/rpc-clnt.h
@@ -19,6 +19,7 @@
typedef enum {
RPC_CLNT_CONNECT,
RPC_CLNT_DISCONNECT,
+ RPC_CLNT_PING,
RPC_CLNT_MSG,
RPC_CLNT_DESTROY
} rpc_clnt_event_t;
diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c
index e224dcc022e..5556740ca81 100644
--- a/rpc/rpc-lib/src/rpc-transport.c
+++ b/rpc/rpc-lib/src/rpc-transport.c
@@ -166,6 +166,19 @@ out:
+/*
+ * Format the path of the shared object for transport @type into @name via
+ * gf_asprintf().  The directory is "<GLUSTER_LIBDIR>/rpc-transport" when
+ * the GLUSTER_LIBDIR environment variable is set, otherwise the
+ * compiled-in RPC_TRANSPORTDIR.  Returns the gf_asprintf() result (the
+ * caller treats -1 as failure).
+ */
+int rpc_transport_lib_path (char **name, char *type)
+{
+        char *override_dir = getenv ("GLUSTER_LIBDIR");
+
+        if (override_dir != NULL)
+                return gf_asprintf (name, "%s/rpc-transport/%s.so",
+                                    override_dir, type);
+
+        return gf_asprintf (name, "%s/%s.so", RPC_TRANSPORTDIR, type);
+}
+
+
+
rpc_transport_t *
rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)
{
@@ -274,7 +287,7 @@ rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)
goto fail;
}
- ret = gf_asprintf (&name, "%s/%s.so", RPC_TRANSPORTDIR, type);
+ ret = rpc_transport_lib_path (&name, type);
if (-1 == ret) {
goto fail;
}
@@ -652,18 +665,37 @@ out:
return ret;
}
+/** @brief build a dictionary containing basic transport options.
+ *
+ * @param[out] options: will be set to a newly created dictionary on success.
+ * @param[in] hostname: desired target hostname.
+ * @param[in] port: desired target port.
+ * @param[in] addr_family (optional): desired address family. If NULL,
+ * default will be used.
+ *
+ * @returns zero on success.
+ */
int
rpc_transport_inet_options_build (dict_t **options, const char *hostname,
- int port)
+ int port, const char *addr_family)
{
dict_t *dict = NULL;
char *host = NULL;
int ret = -1;
+#ifdef IPV6_DEFAULT
+ const char *addr_family_default = "inet6";
+#else
+ const char *addr_family_default = "inet";
+#endif
GF_ASSERT (options);
GF_ASSERT (hostname);
GF_ASSERT (port >= 1024);
+ if (!addr_family) {
+ addr_family = addr_family_default;
+ }
+
dict = dict_new ();
if (!dict)
goto out;
@@ -688,6 +720,14 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname,
goto out;
}
+ ret = dict_set_str (dict, "transport.address-family",
+ (char *)addr_family);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set address-family to %s", addr_family);
+ goto out;
+ }
+
ret = dict_set_str (dict, "transport-type", "socket");
if (ret) {
gf_log (THIS->name, GF_LOG_WARNING,
diff --git a/rpc/rpc-lib/src/rpc-transport.h b/rpc/rpc-lib/src/rpc-transport.h
index f0add065065..0f555462ea4 100644
--- a/rpc/rpc-lib/src/rpc-transport.h
+++ b/rpc/rpc-lib/src/rpc-transport.h
@@ -311,5 +311,6 @@ rpc_transport_unix_options_build (dict_t **options, char *filepath,
int frame_timeout);
int
-rpc_transport_inet_options_build (dict_t **options, const char *hostname, int port);
+rpc_transport_inet_options_build (dict_t **options, const char *hostname,
+ int port, const char *addr_family);
#endif /* __RPC_TRANSPORT_H__ */
diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c
index f07e745a4b3..9dc3bff427c 100644
--- a/rpc/rpc-lib/src/rpcsvc.c
+++ b/rpc/rpc-lib/src/rpcsvc.c
@@ -37,6 +37,10 @@
#include <stdarg.h>
#include <stdio.h>
+#include <stdlib.h>
+#ifdef IPV6_DEFAULT
+#include <netconfig.h>
+#endif
+
#include "xdr-rpcclnt.h"
#include "glusterfs-acl.h"
@@ -1363,6 +1367,82 @@ rpcsvc_error_reply (rpcsvc_request_t *req)
return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL);
}
+#ifdef IPV6_DEFAULT
+/*
+ * Register @newprog with the local rpcbind service over IPv6 ("tcp6") for
+ * @port.
+ *
+ * Returns 0 when @newprog is NULL or once the registration has been
+ * attempted, and -1 when the tcp6 netconfig entry or the universal
+ * address could not be obtained.  A refusal by rpcb_set() is only logged
+ * and does not change the return value.
+ */
+int
+rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port)
+{
+        char              addr_buf[64];
+        int               err     = 0;
+        int               len     = 0;
+        bool_t            success = 0;
+        struct netconfig *nc      = NULL;
+        struct netbuf    *nb      = NULL;
+
+        if (!newprog) {
+                goto out;
+        }
+
+        nc = getnetconfigent ("tcp6");
+        if (!nc) {
+                err = -1;
+                goto out;
+        }
+
+        /* Universal address: the unspecified IPv6 address followed by the
+         * high and low byte of the port. */
+        len = snprintf (addr_buf, sizeof (addr_buf), "::.%d.%d",
+                        (int)(port >> 8 & 0xff), (int)(port & 0xff));
+        if (len < 0 || (size_t)len >= sizeof (addr_buf)) {
+                err = -1;
+                goto out;
+        }
+
+        nb = uaddr2taddr (nc, addr_buf);
+        if (!nb) {
+                err = -1;
+                goto out;
+        }
+
+        success = rpcb_set (newprog->prognum, newprog->progver, nc, nb);
+        if (!success) {
+                gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6"
+                        " service with rpcbind");
+        }
+
+        err = 0;
+
+out:
+        /* The netbuf and the netconfig entry are allocated by the RPC
+         * library on our behalf; release them on every exit path. */
+        if (nb) {
+                free (nb->buf);
+                free (nb);
+        }
+        if (nc) {
+                freenetconfigent (nc);
+        }
+        return err;
+}
+
+/*
+ * Remove the IPv6 ("tcp6") rpcbind registration of @newprog.
+ *
+ * Returns 0 when @newprog is NULL or once the unregistration has been
+ * attempted, and -1 when the tcp6 netconfig entry could not be obtained.
+ * A refusal by rpcb_unset() is only logged and does not change the
+ * return value.
+ */
+int
+rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog)
+{
+        int               err     = 0;
+        bool_t            success = 0;
+        struct netconfig *nc      = NULL;
+
+        if (!newprog) {
+                goto out;
+        }
+
+        nc = getnetconfigent ("tcp6");
+        if (!nc) {
+                err = -1;
+                goto out;
+        }
+
+        success = rpcb_unset (newprog->prognum, newprog->progver, nc);
+        if (!success) {
+                gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6"
+                        " service with rpcbind");
+        }
+
+        err = 0;
+out:
+        /* The entry returned by getnetconfigent() must be handed back. */
+        if (nc) {
+                freenetconfigent (nc);
+        }
+        return err;
+}
+#endif
/* Register the program with the local portmapper service. */
int
@@ -1527,7 +1607,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program)
" program failed");
goto out;
}
-
+#ifdef IPV6_DEFAULT
+ ret = rpcsvc_program_unregister_rpcbind6 (program);
+ if (ret == -1) {
+ gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)"
+ " unregistration of program failed");
+ goto out;
+ }
+#endif
pthread_mutex_lock (&svc->rpclock);
{
list_for_each_entry (prog, &svc->programs, program) {
diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h
index 02e467e68a7..1032df03b0e 100644
--- a/rpc/rpc-lib/src/rpcsvc.h
+++ b/rpc/rpc-lib/src/rpcsvc.h
@@ -437,6 +437,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener);
extern int
rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port);
+#ifdef IPV6_DEFAULT
+extern int
+rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port);
+#endif
+
extern int
rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog);
diff --git a/rpc/rpc-transport/rdma/src/name.c b/rpc/rpc-transport/rdma/src/name.c
index 8003b1c87a0..b9d3269eb73 100644
--- a/rpc/rpc-transport/rdma/src/name.c
+++ b/rpc/rpc-transport/rdma/src/name.c
@@ -54,6 +54,10 @@ af_inet_bind_to_port_lt_ceiling (struct rdma_cm_id *cm_id,
struct sockaddr *sockaddr,
socklen_t sockaddr_len, uint32_t ceiling)
{
+#if defined(NO_PRIVPORT)
+ _assign_port(sockaddr, 0);
+ return rdma_bind_addr (cm_id, sockaddr);
+#else
int32_t ret = -1;
uint16_t port = ceiling - 1;
gf_boolean_t ports[GF_PORT_MAX];
@@ -100,6 +104,7 @@ loop:
}
return ret;
+#endif /* NO_PRIVPORT */
}
#if 0
diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c
index 0e34dc211fe..cab4161c076 100644
--- a/rpc/rpc-transport/socket/src/name.c
+++ b/rpc/rpc-transport/socket/src/name.c
@@ -42,6 +42,10 @@ static int32_t
af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr,
socklen_t sockaddr_len, uint32_t ceiling)
{
+#if defined(NO_PRIVPORT)
+ _assign_port(sockaddr, 0);
+ return bind (fd, sockaddr, sockaddr_len);
+#else
int32_t ret = -1;
uint16_t port = ceiling - 1;
gf_boolean_t ports[GF_PORT_MAX];
@@ -88,6 +92,7 @@ loop:
}
return ret;
+#endif /* NO_PRIVPORT */
}
static int32_t
@@ -557,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)
data_t *address_family_data = NULL;
int32_t ret = -1;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+ sa_family_t default_family = AF_INET6;
+#else
+ char *addr_family = "inet";
+ sa_family_t default_family = AF_INET;
+#endif
+
GF_VALIDATE_OR_GOTO ("socket", sa_family, out);
address_family_data = dict_get (this->options,
@@ -581,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)
}
} else {
gf_log (this->name, GF_LOG_DEBUG,
- "option address-family not specified, defaulting to inet");
- *sa_family = AF_INET;
+ "option address-family not specified, "
+ "defaulting to %s", addr_family);
+ *sa_family = default_family;
}
ret = 0;
diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c
index ae551dcfae7..8c1690f820c 100644
--- a/rpc/rpc-transport/socket/src/socket.c
+++ b/rpc/rpc-transport/socket/src/socket.c
@@ -866,7 +866,7 @@ __socket_keepalive (int fd, int family, int keepalive_intvl,
goto err;
}
#else
- if (family != AF_INET)
+ if (family != AF_INET && family != AF_INET6)
goto done;
ret = setsockopt (fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_idle,
@@ -3009,6 +3009,21 @@ socket_connect (rpc_transport_t *this, int port)
}
}
+ /* Make sure we are not vulnerable to someone setting
+ * net.ipv6.bindv6only to 1 so that gluster services are
+ * available over IPv4 & IPv6.
+ */
+                /* IPV6_V6ONLY is only meaningful on AF_INET6 sockets;
+                 * trying it on an IPv4 or UNIX socket fails and would log
+                 * a warning on every connect. */
+                if (sa_family == AF_INET6) {
+                        int disable_v6only = 0;
+
+                        if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                                        (void *)&disable_v6only,
+                                        sizeof (disable_v6only)) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "Error disabling sockopt IPV6_V6ONLY: \"%s\"",
+                                        strerror (errno));
+                        }
+                }
+
+
if (priv->nodelay && (sa_family != AF_UNIX)) {
ret = __socket_nodelay (priv->sock);
diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x
index 8462dcc258a..5ec8109d828 100644
--- a/rpc/xdr/src/glusterfs-fops.x
+++ b/rpc/xdr/src/glusterfs-fops.x
@@ -84,6 +84,7 @@ enum glusterfs_event_t {
GF_EVENT_UPCALL,
GF_EVENT_SCRUB_STATUS,
GF_EVENT_SOME_CHILD_DOWN,
+ GF_EVENT_CHILD_PING,
GF_EVENT_MAXVAL
};
diff --git a/run-tests.sh b/run-tests.sh
index 1487f30d832..866ab0464b4 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -182,12 +182,14 @@ function get_test_status ()
# for later. Why does the key have the distro and version then?
# Because changing the key in all test files would be very big process
# updating just this function with a better logic much simpler.
+ #
+ # FB Edit: For FB tests we are disabling NetBSD testing.
+ #
Linux)
result=$(grep -e "^#G_TESTDEF_TEST_STATUS_CENTOS6" $test_name | \
awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;;
NetBSD)
- result=$(grep -e "^#G_TESTDEF_TEST_STATUS_NETBSD7" $test_name | \
- awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;;
+ result="KNOWN_ISSUE" ;;
*)
result="ENABLED" ;;
esac
diff --git a/tests/basic/accept-v6v4.t b/tests/basic/accept-v6v4.t
new file mode 100644
index 00000000000..7128c12c6be
--- /dev/null
+++ b/tests/basic/accept-v6v4.t
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+. $(dirname $0)/../nfs.rc
+
+#
+# This test ensures that GlusterFS provides NFS, Mount and its Management daemon
+# over both IPv4 and IPv6. It uses netcat to check the services running on both
+# IPv4 & IPv6 addresses as well as a mount to test that mount & nfs work.
+#
+
+IPV4_SUPPORT=false
+IPV6_SUPPORT=false
+
+host $HOSTNAME | grep -q "has address" && IPV4_SUPPORT=true
+host $HOSTNAME | grep -q "has IPv6 address" && IPV6_SUPPORT=true
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+mkdir -p $B0/b{0,1,2}
+
+# make sure no registered rpcbind services are running
+service rpcbind restart
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI vol create $V0 replica 3 $H0:$B0/b0 $H0:$B0/b1 $H0:$B0/b2
+
+TEST $CLI vol set $V0 cluster.self-heal-daemon off
+TEST $CLI vol set $V0 nfs.disable off
+TEST $CLI vol set $V0 cluster.choose-local off
+TEST $CLI vol start $V0
+
+MOUNTD_PORT=38465
+MGMTD_PORT=24007
+NFSD_PORT=2049
+
+# Print "Y" if a TCP connection to $1 (address) on port $2 can be opened,
+# "N" otherwise.  An address of "NONE" means the host has no address of
+# that family, so the check is reported as passing.  $3 ("v4"/"v6") is
+# accepted for the callers' benefit; the probe itself does not need it.
+function check_ip_port {
+        local ip=$1
+        local port=$2
+
+        if [ "$ip" == "NONE" ]; then
+                echo "Y"
+                return
+        fi
+
+        # Probe from a subshell so that descriptor 3 is released again
+        # instead of staying open in the calling shell.
+        if (exec 3<>/dev/tcp/$ip/$port); then
+                echo "Y"
+        else
+                echo "N"
+        fi
+}
+
+# Print "Y" if volume $V0 can be NFS-mounted on $N0 through address $1
+# (and unmount it again), "N" if the mount fails.  $2 selects the address
+# notation: "v6" addresses are wrapped in brackets.  An address of "NONE"
+# is reported as "Y" without mounting anything.
+function check_nfs {
+        ip=$1
+        type=$2
+
+        if [ "$ip" == "NONE" ]; then
+                echo "Y"
+                return
+        fi
+
+        addr="$ip"
+        if [ "$type" == "v6" ]; then
+                addr="[$ip]"
+        fi
+
+        if ! mount_nfs $addr:/$V0 $N0; then
+                echo "N"
+                return
+        fi
+
+        umount_nfs $N0
+        echo "Y"
+}
+
+# IPV4_SUPPORT/IPV6_SUPPORT hold the command names "true"/"false", so they
+# have to be executed: "[ ! $VAR ]" merely tests for an empty string and
+# could never fire.
+if ! $IPV4_SUPPORT && ! $IPV6_SUPPORT; then
+        exit 1
+fi
+
+# Get the V4 & V6 addresses of this host
+if $IPV4_SUPPORT; then
+ V4=$(host $HOSTNAME | head -n1 | awk -F ' ' '{print $4}')
+else
+ V4="NONE"
+fi
+
+if $IPV6_SUPPORT; then
+ V6=$(host $HOSTNAME | tail -n1 | awk -F ' ' '{print $5}')
+else
+ V6="NONE"
+fi
+
+# First check the management daemon
+EXPECT "Y" check_ip_port $V6 $MGMTD_PORT "v6"
+EXPECT "Y" check_ip_port $V4 $MGMTD_PORT "v4"
+
+# Give the MOUNT/NFS Daemon some time to start up
+sleep 4
+
+# Pass each address together with its own family label, as done for the
+# management daemon checks.
+EXPECT "Y" check_ip_port $V6 $MOUNTD_PORT "v6"
+EXPECT "Y" check_ip_port $V4 $MOUNTD_PORT "v4"
+
+EXPECT "Y" check_ip_port $V6 $NFSD_PORT "v6"
+EXPECT "Y" check_ip_port $V4 $NFSD_PORT "v4"
+
+# Mount the file system
+EXPECT "Y" check_nfs $V6 "v6"
+EXPECT "Y" check_nfs $V4 "v4"
+
+cleanup;
diff --git a/tests/basic/dht-min-free-space.t b/tests/basic/dht-min-free-space.t
new file mode 100755
index 00000000000..17d10cc39a5
--- /dev/null
+++ b/tests/basic/dht-min-free-space.t
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+# Unmount brick filesystems left over from an earlier run; use ${V0} so the
+# paths match the ones created and mounted below.
+grep $B0/${V0}1 /proc/mounts &> /dev/null && umount $B0/${V0}1
+grep $B0/${V0}2 /proc/mounts &> /dev/null && umount $B0/${V0}2
+losetup -d /dev/loop0 2> /dev/null
+losetup -d /dev/loop1 2> /dev/null
+mkdir $B0/${V0}{1..2}
+
+TEST glusterd
+
+TEST dd if=/dev/zero of=/tmp/${V0}-dev1 bs=1M count=30
+TEST dd if=/dev/zero of=/tmp/${V0}-dev2 bs=1M count=30
+
+TEST losetup /dev/loop0 /tmp/${V0}-dev1
+TEST losetup /dev/loop1 /tmp/${V0}-dev2
+
+TEST mkfs.xfs /dev/loop0
+TEST mkfs.xfs /dev/loop1
+
+TEST mount /dev/loop0 $B0/${V0}1
+TEST mount /dev/loop1 $B0/${V0}2
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}2
+TEST $CLI volume set $V0 cluster.min-free-disk 2MB
+TEST $CLI volume set $V0 cluster.min-free-strict-mode on
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 0
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+####################################
+# Test re-directs of file creation #
+####################################
+
+# This should work, no redirects
+TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=8
+# Each condition gets its own TEST: anything after "&&" on a TEST line runs
+# outside the harness and its result is ignored.
+TEST [ -f $B0/${V0}2/testfile1 ]
+TEST [ ! -k $B0/${V0}1/testfile1 ]
+
+TEST $CLI volume set $V0 cluster.min-free-disk 19MB
+
+# This should work, & the file redirected
+# Subvolume 2 should have the linkto &
+# Subvolume 1 should have the original
+TEST dd if=/dev/zero of=$M0/testfile3 bs=1M count=4
+TEST [ -f $B0/${V0}1/testfile3 ]
+TEST [ ! -k $B0/${V0}1/testfile3 ]
+TEST [ -k $B0/${V0}2/testfile3 ]
+
+# This should fail, cluster is full
+TEST ! dd if=/dev/zero of=$M0/testfile2 bs=1M count=23
+
+###################
+# Strict mode off #
+###################
+TEST $CLI volume set $V0 cluster.min-free-strict-mode off
+TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=20
+TEST rm -f $M0/testfile1
+
+###################
+# Strict mode on #
+###################
+TEST $CLI volume set $V0 cluster.min-free-strict-mode on
+TEST ! dd if=/dev/zero of=$M0/testfile1 bs=1M count=16
+TEST rm -f $M0/testfile1
+
+killall gluster{fs,fsd,d}
+
+umount -lf $B0/${V0}1
+umount -lf $B0/${V0}2
+
+losetup -d /dev/loop0
+losetup -d /dev/loop1
+
+cleanup;
diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common
index 83c4463a912..152e3b51236 100644
--- a/tests/basic/ec/ec-common
+++ b/tests/basic/ec/ec-common
@@ -45,7 +45,7 @@ for size in $SIZE_LIST; do
eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }')
done
-TEST df -h
+TEST df -h $M0
TEST stat $M0
for idx in `seq 0 $LAST_BRICK`; do
diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t
index 98dd9232c73..3e3467535fb 100644
--- a/tests/basic/ec/self-heal.t
+++ b/tests/basic/ec/self-heal.t
@@ -136,7 +136,7 @@ TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024
cs=$(sha1sum $tmp/test | awk '{ print $1 }')
-TEST df -h
+TEST df -h $M0
TEST stat $M0
for idx in {0..5}; do
diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t
index fdaf9c2822e..da88bbcb2cc 100644
--- a/tests/basic/exports_parsing.t
+++ b/tests/basic/exports_parsing.t
@@ -32,7 +32,20 @@ function test_bad_opt ()
glusterfsd --print-exports $1 2>&1 | sed -n 1p
}
-EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports
+function check_export_line() {
+ if [ "$1" == "$2" ]; then
+ echo "Y"
+ else
+ echo "N"
+ fi
+ return
+}
+
+export_result=$(test_good_file $EXP_FILES/exports)
+EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result"
+
+export_result=$(test_good_file $EXP_FILES/exports-v6)
+EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result"
EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports
EXPECT_KEYWORD "Error parsing netgroups for:" test_long_netgroup $EXP_FILES/bad_exports
diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t
index cea8aa737c0..713c7e27579 100644
--- a/tests/basic/fop-sampling.t
+++ b/tests/basic/fop-sampling.t
@@ -2,13 +2,27 @@
#
. $(dirname $0)/../include.rc
+. $(dirname $0)/../nfs.rc
. $(dirname $0)/../volume.rc
-SAMPLE_FILE="$(gluster --print-logdir)/samples/glusterfs_${V0}.samp"
+BRICK_SAMPLES="$(gluster --print-logdir)/samples/glusterfsd__d_backends_${V0}0.samp"
+NFS_SAMPLES="$(gluster --print-logdir)/samples/glusterfs_nfsd.samp"
+
+# Print "Y" if, among the lines of sample file $3 that mention fop $1, at
+# least one has path $2 in its 11th comma-separated field; "N" otherwise.
+function check_path {
+        op=$1
+        path=$2
+        file=$3
+        # Redirect stdout first and stderr second: in the reverse order
+        # stderr stays attached to the caller's stdout and can end up in
+        # the value the harness compares.
+        grep $op $file | awk -F, '{print $11}' | grep $path > /dev/null 2>&1
+        if [ $? -eq 0 ]; then
+                echo "Y"
+        else
+                echo "N"
+        fi
+}
function print_cnt() {
local FOP_TYPE=$1
- local FOP_CNT=$(grep ,${FOP_TYPE} ${SAMPLE_FILE} | wc -l)
+ local FOP_CNT=$(grep ,${FOP_TYPE} ${BRICK_SAMPLES} | wc -l)
echo $FOP_CNT
}
@@ -42,12 +56,18 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume set $V0 nfs.disable off
TEST $CLI volume set $V0 diagnostics.latency-measurement on
TEST $CLI volume set $V0 diagnostics.count-fop-hits on
-TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2
+TEST $CLI volume set $V0 diagnostics.stats-dump-interval 5
TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535
TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1
TEST $CLI volume set $V0 diagnostics.stats-dnscache-ttl-sec 3600
-
TEST $CLI volume start $V0
+
+>${NFS_SAMPLES}
+>${BRICK_SAMPLES}
+
+#################
+# Basic Samples #
+#################
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
for i in {1..5}
@@ -58,4 +78,52 @@ done
TEST ls -l $M0
EXPECT_WITHIN 6 "OK" check_samples
-cleanup
+sleep 2
+
+################################
+# Paths in the samples #
+################################
+
+TEST mount_nfs $H0:$V0 $N0
+
+ls $N0 &> /dev/null
+touch $N0/file1
+stat $N0/file1 &> /dev/null
+echo "some data" > $N0/file1
+dd if=/dev/zero of=$N0/file2 bs=1M count=10 conv=fsync
+dd if=/dev/zero of=$N0/file1 bs=1M count=1
+cat $N0/file2 &> /dev/null
+mkdir -p $N0/dir1
+rmdir $N0/dir1
+rm $N0/file1
+rm $N0/file2
+
+EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FINODELK /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ENTRYLK / $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $BRICK_SAMPLES
+
+
+EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ACCESS /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ACCESS /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path READ /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $NFS_SAMPLES
+
+cleanup;
diff --git a/tests/basic/fops-sanity-gfproxy.t b/tests/basic/fops-sanity-gfproxy.t
new file mode 100755
index 00000000000..b3bb8a502cc
--- /dev/null
+++ b/tests/basic/fops-sanity-gfproxy.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 $H0:$B0/brick1;
+EXPECT 'Created' volinfo_field $V0 'Status';
+
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+#gfproxy server
+TEST glusterfs --volfile-id=gfproxy/$V0 --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log
+
+#mount on a random dir
+TEST glusterfs --entry-timeout=3600 --attribute-timeout=3600 -s $H0 --volfile-id=gfproxy-client/$V0 $M0 --direct-io-mode=yes
+TEST grep gfproxy-client /proc/mounts
+
+build_tester $(dirname $0)/fops-sanity.c
+
+TEST cp $(dirname $0)/fops-sanity $M0
+cd $M0
+TEST ./fops-sanity $V0
+cd -
+rm -f $(dirname $0)/fops-sanity
+
+cleanup;
diff --git a/tests/basic/gfproxy.t b/tests/basic/gfproxy.t
new file mode 100644
index 00000000000..71c6788db76
--- /dev/null
+++ b/tests/basic/gfproxy.t
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../nfs.rc
+
+cleanup;
+
+function start_gfproxyd {
+ glusterfs --volfile-id=gfproxy/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log
+}
+
+function restart_gfproxyd {
+ pkill -f gfproxy/${V0}
+ start_gfproxyd
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 config.gfproxyd-remote-host $H0
+TEST $CLI volume start $V0
+
+sleep 2
+
+REGULAR_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-fuse.vol"
+GFPROXY_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-gfproxy-fuse.vol"
+GFPROXYD_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.gfproxyd.vol"
+
+# Client volfile must exist
+TEST [ -f $GFPROXY_CLIENT_VOLFILE ]
+
+# AHA & write-behind translators must exist
+TEST grep "cluster/aha" $GFPROXY_CLIENT_VOLFILE
+TEST grep "performance/write-behind" $GFPROXY_CLIENT_VOLFILE
+
+# Make sure we didn't screw up the existing client
+TEST grep "performance/write-behind" $REGULAR_CLIENT_VOLFILE
+TEST grep "cluster/replicate" $REGULAR_CLIENT_VOLFILE
+TEST grep "cluster/distribute" $REGULAR_CLIENT_VOLFILE
+
+TEST [ -f $GFPROXYD_VOLFILE ]
+
+TEST grep "cluster/replicate" $GFPROXYD_VOLFILE
+TEST grep "cluster/distribute" $GFPROXYD_VOLFILE
+
+# AHA & write-behind must *not* exist
+TEST ! grep "cluster/aha" $GFPROXYD_VOLFILE
+TEST ! grep "performance/write-behind" $GFPROXYD_VOLFILE
+
+# Test that we can start the server and the client
+TEST start_gfproxyd
+TEST glusterfs --volfile-id=gfproxy-client/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy-client.log $M0
+sleep 2
+TEST grep gfproxy-client/${V0} /proc/mounts
+
+# Write data to the mount and checksum it
+TEST dd if=/dev/urandom bs=1M count=10 of=/tmp/testfile1
+md5=$(md5sum /tmp/testfile1 | awk '{print $1}')
+TEST cp -v /tmp/testfile1 $M0/testfile1
+TEST [ "$(md5sum $M0/testfile1 | awk '{print $1}')" == "$md5" ]
+
+rm /tmp/testfile1
+
+# Background write that is expected to survive the gfproxyd restart below.
+# NOTE(review): the target is $N0, but this test never mounts NFS;
+# presumably $M0 (the gfproxy client mount) was intended - confirm.
+dd if=/dev/zero of=$N0/bigfile bs=1M count=3072 &
+BG_STRESS_PID=$!
+
+sleep 3
+
+restart_gfproxyd
+
+TEST wait $BG_STRESS_PID
+
+cleanup;
diff --git a/tests/basic/glusterd/volfile_server_switch.t b/tests/basic/glusterd/volfile_server_switch.t
index 0b0e6470244..0b01398215c 100644
--- a/tests/basic/glusterd/volfile_server_switch.t
+++ b/tests/basic/glusterd/volfile_server_switch.t
@@ -1,5 +1,8 @@
#!/bin/bash
+#G_TESTDEF_TEST_STATUS_CENTOS6=KNOWN_ISSUE,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=KNOWN_ISSUE,BUG=000000
+
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../cluster.rc
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
new file mode 100644
index 00000000000..f3655eaef3b
--- /dev/null
+++ b/tests/basic/halo-failover-disabled.t
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+# brick immediately, and md5s will show they are equal once
+# the write completes.
+# 4. The mount should also be RW after the brick is killed as
+# quorum will be immediately restored by swapping in the
+# other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.halo-failover-enabled off
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+
+# Use a large ping time here so the spare brick is not marked up
+# based on the ping time. The only way it can get marked up is
+# by being swapped in via the down event (which is what we are disabling).
+TEST $CLI volume set $V0 network.ping-timeout 1000
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
+
+# Write some data to the mount
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
+
+# Make sure two children are down and one is up.
+EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3
+
+# Test that quorum should fail and the mount is RO, the reason here
+# is that although there _is_ another brick running which _could_
+# take the failed brick's place, it is not marked "up" so quorum
+# will not be fulfilled. If we waited 1000 seconds the brick would
+# indeed be activated based on ping time, but for our test we want
+# the decision to be solely "down event" driven, not ping driven.
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX
+
+# Test that quorum should be restored and the file is writable
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+cleanup
diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t
new file mode 100644
index 00000000000..2dddf9951fa
--- /dev/null
+++ b/tests/basic/halo-failover-enabled.t
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+# brick immediately, and md5s will show they are equal once
+# the write completes.
+# 4. The mount should also be RW after the brick is killed as
+# quorum will be immediately restored by swapping in the
+# other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-failover-enabled on
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 network.ping-timeout 20
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
+
+# Write some data to the mount
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST [ -n "$KILL_IDX" ]
+# NB: UP_CHILDREN is the set of children that should be up after we kill
+# the brick indicated by KILL_IDX, *not* the set of children which are
+# currently up!
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
+UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)"
+UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)"
+VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)"
+
+# The victim brick should have a copy of the file.
+TEST [ -n "$VICTIM_HAS_TEST" ]
+
+# Of the bricks which will remain standing, there should be only one
+# brick which has the file called test. If they both have the first
+# test file, the test is invalid as all the bricks are up and the
+# halo-max-replicas is not being honored; e.g. bug exists.
+ONLY_ONE=$((([ -z "$UP2_HAS_TEST" ] || [ -z "$UP1_HAS_TEST" ]) &&
+ ([ -n "$UP2_HAS_TEST" ] || [ -n "$UP1_HAS_TEST" ])) && echo true)
+TEST [ "x$ONLY_ONE" == "xtrue" ]
+
+echo "Failing child ${KILL_IDX}..."
+TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX}
+
+# Test the mount is still RW (i.e. quorum works)
+TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync
+
+# Calculate the MD5s
+MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1)
+MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1)
+
+# Verify the two up bricks have identical MD5s, if both are identical
+# then we must have successfully failed-over to the brick which was
+# previously proven to be down (via the ONLY_ONE test).
+TEST [ "$MD5_UP1" == "$MD5_UP2" ]
+
+cleanup
diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t
new file mode 100644
index 00000000000..4574fdfe41e
--- /dev/null
+++ b/tests/basic/halo-hybrid.t
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test for the Halo hybrid feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+# heal daemon is off to start.
+# 2. Write some data
+# 3. Verify hybrid code chose children for lookups
+# 4. Verify hybrid code chose child for reads
+# 5. Verify hybrid code wrote synchronously to all replicas
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function found_fuse_log_msg {
+    # Callers pass only the message; the legacy (dir, msg) form still works.
+    local msg="${2:-$1}"
+    local cnt=$(grep -- "$msg" /var/log/glusterfs/$M0LOG | tail -n1 | wc -l)
+    if (( cnt == 1 )); then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-hybrid-mode True
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level TRACE
+TEST $CLI volume start $V0
+
+# Start a synchronous mount
+TEST glusterfs --volfile-id=/$V0 \
+ --xlator-option *replicate*.halo-max-latency=9999 \
+ --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+sleep 2
+cd $M0
+
+TEST mkdir testdir
+TEST cd testdir
+for i in {1..5}
+do
+ dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+TEST ls -l
+
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs"
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child"
+
+B0_CNT=$(ls $B0/${V0}0/testdir | wc -l)
+B1_CNT=$(ls $B0/${V0}1/testdir | wc -l)
+B2_CNT=$(ls $B0/${V0}2/testdir | wc -l)
+
+# Writes should be synchronous, all should have same
+# file count
+TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))"
+
+cleanup
diff --git a/tests/basic/halo.t b/tests/basic/halo.t
new file mode 100644
index 00000000000..25aca3442ab
--- /dev/null
+++ b/tests/basic/halo.t
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Test for the Halo geo-replication feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+# heal daemon is off to start.
+# 2. Write some data
+# 3. Verify at least one of the bricks did not receive the writes.
+# 4. Turn the heal daemon on
+# 5. Within 30 seconds the SHD should async heal the data over
+# to the 3rd brick.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+
+for i in {1..5}
+do
+ dd if=/dev/urandom of=f bs=1M count=1 2>/dev/null
+ mkdir a; cd a;
+done
+
+B0_CNT=$(ls $B0/${V0}0 | wc -l)
+B1_CNT=$(ls $B0/${V0}1 | wc -l)
+B2_CNT=$(ls $B0/${V0}2 | wc -l)
+
+# One of the brick dirs should be empty
+TEST "(($B0_CNT == 0 || $B1_CNT == 0 || $B2_CNT == 0))"
+
+# Ok, turn the heal daemon on and verify it heals it up
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+cleanup
diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t
index 9df5cb45c3b..cd0189788ba 100755
--- a/tests/basic/mount-nfs-auth.t
+++ b/tests/basic/mount-nfs-auth.t
@@ -15,6 +15,9 @@ TEST glusterd
TEST pidof glusterd
TEST $CLI volume info
+H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1)
+H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}')
+
# Export variables for allow & deny
EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
EXPORT_ALLOW_SLASH="/$V0/ $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
@@ -37,6 +40,10 @@ function build_dirs () {
mkdir -p $B0/b{0,1,2}/L1/L2/L3
}
+function export_allow_this_host_ipv6 () {
+        printf "%s\n" "${EXPORT_ALLOW6:-/$V0 $H0IP6(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)}" > ${NFSDIR}/exports
+}
+
function export_allow_this_host () {
printf "$EXPORT_ALLOW\n" > ${NFSDIR}/exports
}
@@ -186,6 +193,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available
## Mount NFS
EXPECT "Y" check_mount_success $V0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+
+## Mount NFS using the IPv6 export
+export_allow_this_host_ipv6
+EXPECT "Y" check_mount_success $V0
## Disallow host
TEST export_deny_this_host
diff --git a/tests/basic/write-behind.t b/tests/basic/write-behind.t
new file mode 100644
index 00000000000..edad59786af
--- /dev/null
+++ b/tests/basic/write-behind.t
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function clear_stats {
+    > "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump"
+}
+
+function got_expected_write_count {
+    local expected_size=$1
+    local expected_value=$2
+    # -q keeps stdout to just Y/N; -w stops e.g. 100 from matching 1000.
+    if grep "aggr.write_${expected_size}" "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump" | grep -qw -- "$expected_value"; then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+
+# These are needed for our tracking of write sizes
+TEST $CLI volume set $V0 diagnostics.latency-measurement on
+TEST $CLI volume set $V0 diagnostics.count-fop-hits on
+TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2
+
+# Disable this in testing to get deterministic results
+TEST $CLI volume set $V0 performance.write-behind-trickling-writes off
+
+TEST $CLI volume start $V0
+
+sleep 2;
+
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
+
+# Write a 100MB file with a window-size 1MB, we should get 100 writes of 1MB each
+TEST dd if=/dev/zero of=$M0/100mb_file bs=1M count=100
+EXPECT_WITHIN 5 "Y" got_expected_write_count "1mb" 100
+
+TEST $CLI volume set $V0 performance.write-behind-window-size 512KB
+
+# Write a 100MB file with a window-size 512KB, we should get 200 writes of 512KB each
+TEST dd if=/dev/zero of=$M0/100mb_file_2 bs=1M count=100
+EXPECT_WITHIN 5 "Y" got_expected_write_count "512kb" 200
+
+cleanup;
diff --git a/tests/bugs/distribute/bug-1099890.t b/tests/bugs/distribute/bug-1099890.t
index 40f70d4938b..29ceccf2309 100644
--- a/tests/bugs/distribute/bug-1099890.t
+++ b/tests/bugs/distribute/bug-1099890.t
@@ -44,6 +44,8 @@ TEST $CLI volume set $V0 features.quota-deem-statfs on
TEST $CLI volume quota $V0 limit-usage / 150MB;
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1
+
TEST $CLI volume set $V0 cluster.min-free-disk 50%
TEST glusterfs -s $H0 --volfile-id=$V0 $M0
diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t
index c5a7f041ac8..8cf905a8f0b 100755
--- a/tests/bugs/distribute/bug-1161311.t
+++ b/tests/bugs/distribute/bug-1161311.t
@@ -53,8 +53,14 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0;
TEST mkdir $M0/dir1
TEST mkdir -p $M0/dir2/dir3
-# Create a large file (1GB), so that rebalance takes time
-dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240
+# Create a large file (6.4 GB), so that rebalance takes time
+# Reading from /dev/urandom is slow, so we'll cat it together
+dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240
+for i in {1..10}; do
+ cat /tmp/FILE2 >> $M0/dir1/FILE2
+done
+
+#dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240
# Rename the file to create a linkto, for rebalance to
# act on the file
diff --git a/tests/bugs/fuse/bug-858488-min-free-disk.t b/tests/bugs/fuse/bug-858488-min-free-disk.t
index 635dc04d1e6..ab636575d3f 100644
--- a/tests/bugs/fuse/bug-858488-min-free-disk.t
+++ b/tests/bugs/fuse/bug-858488-min-free-disk.t
@@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2
## Lets create volume
TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1
## Verify volume is created
EXPECT "$V0" volinfo_field $V0 'Volume Name';
diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
index 9fc7ac3b845..3bc80ab9dab 100644
--- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
+++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
@@ -1,6 +1,6 @@
#!/bin/bash
-## Test case for cluster.min-free-disk option validation.
+## Test case for cluster.min-free-disk option validation.
. $(dirname $0)/../../include.rc
@@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2
TEST $CLI volume start $V0
## Setting invalid value for option cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk 143.!/12
-TEST ! $CLI volume set $V0 min-free-disk 123%
-TEST ! $CLI volume set $V0 min-free-disk 194.34%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12
+TEST ! $CLI volume set $V0 cluster.min-free-disk 123%
+TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34%
## Setting fractional value as a size (unit is byte) for option
## cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk 199.051
-TEST ! $CLI volume set $V0 min-free-disk 111.999
+TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051
+TEST ! $CLI volume set $V0 cluster.min-free-disk 111.999
## Setting valid value for option cluster.min-free-disk should pass
-TEST $CLI volume set $V0 min-free-disk 12%
-TEST $CLI volume set $V0 min-free-disk 56.7%
-TEST $CLI volume set $V0 min-free-disk 120
-TEST $CLI volume set $V0 min-free-disk 369.0000
+TEST $CLI volume set $V0 cluster.min-free-disk 12%
+TEST $CLI volume set $V0 cluster.min-free-disk 56.7%
+TEST $CLI volume set $V0 cluster.min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 369.0000
cleanup;
diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t
index c30d2b852d4..1b9ca18c08a 100755
--- a/tests/bugs/glusterd/bug-859927.t
+++ b/tests/bugs/glusterd/bug-859927.t
@@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " "
TEST $CLI volume set $V0 min-free-inodes 60%
EXPECT "60%" volume_option $V0 cluster.min-free-inodes
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk " "
-TEST $CLI volume set $V0 min-free-disk 60%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk " "
+TEST $CLI volume set $V0 cluster.min-free-disk 60%
EXPECT "60%" volume_option $V0 cluster.min-free-disk
-TEST $CLI volume set $V0 min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 120
EXPECT "120" volume_option $V0 cluster.min-free-disk
TEST ! $CLI volume set $V0 frame-timeout ""
diff --git a/tests/cluster.rc b/tests/cluster.rc
index 467bbcb06e1..42547f09e37 100644
--- a/tests/cluster.rc
+++ b/tests/cluster.rc
@@ -46,17 +46,18 @@ function define_glusterds() {
bopt="management.transport.socket.bind-address=${!h}";
popt="--pid-file=${!b}/glusterd.pid";
sopt="management.glusterd-sockfile=${!b}/glusterd/gd.sock"
+ aopt="*.transport.address-family=inet"
#Get the logdir
logdir=`gluster --print-logdir`
#Fetch the testcases name and prefix the glusterd log with it
logfile=`echo ${0##*/}`_glusterd$i.log
lopt="--log-file=$logdir/$logfile"
if [ "$2" == "-LDEBUG" ]; then
- eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
- eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
+ eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
+ eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
else
- eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
- eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
+ eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
+ eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
fi
done
}
diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6
new file mode 100644
index 00000000000..426b1ef5705
--- /dev/null
+++ b/tests/configfiles/exports-v6
@@ -0,0 +1 @@
+/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,)
diff --git a/tests/env.rc.in b/tests/env.rc.in
index 82971c4a8de..87befc3711d 100644
--- a/tests/env.rc.in
+++ b/tests/env.rc.in
@@ -28,3 +28,6 @@ export PYTHON
PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH
export PYTHONPATH
+
+TESTER_CFLAGS="@TESTER_CFLAGS@"
+export TESTER_CFLAGS
diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t
new file mode 100755
index 00000000000..4372998681f
--- /dev/null
+++ b/tests/features/brick-min-free-space.t
@@ -0,0 +1,113 @@
+#!/bin/bash
+#
+# Test storage.min-free-disk option works.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+TEST truncate -s 16M $B0/brick0
+TEST LOOPDEV=$(losetup --find --show $B0/brick0)
+TEST mkfs.xfs $LOOPDEV
+
+mkdir -p $B0/$V0
+
+TEST mount -t xfs $LOOPDEV $B0/$V0
+
+###########
+# AIO on #
+###########
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio on
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit.
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+############
+# AIO off #
+############
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit.
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup;
diff --git a/tests/features/lock_revocation.t b/tests/features/lock_revocation.t
new file mode 100644
index 00000000000..cbf21b71650
--- /dev/null
+++ b/tests/features/lock_revocation.t
@@ -0,0 +1,52 @@
+#!/bin/bash
+logdir=$(gluster --print-logdir)
+BRICK_LOGFILES="$logdir/bricks/d-backends-brick?.log"
+rm -f $BRICK_LOGFILES &> /dev/null
+
+# Test that lock revocation works
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+cleanup;
+
+function deadlock_fop() {
+    local MNT=$1
+    # Keep rewriting the file until a brick log reports a monkey lock,
+    # giving up after 1000 attempts.
+    for i in {1..1000}; do
+        dd if=/dev/zero of=$MNT/testfile bs=1k count=10 &> /dev/null
+        ! grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null || break
+    done
+}
+
+function monkey_unlock() {
+    if grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null; then echo SUCCESS; fi
+    return 0
+}
+
+function append_to_file() {
+    local FILE_PATH=$1
+    # Return the write's own status so TEST fails when the append fails.
+    echo "hello" >> "$FILE_PATH"
+}
+
+#Init
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 features.locks-monkey-unlocking on
+TEST $CLI volume set $V0 features.locks-revocation-secs 2
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 -s $H0 $M0;
+TEST $GFS --volfile-id=$V0 -s $H0 $M1;
+
+# Deadlock writes to a file using monkey unlocking
+deadlock_fop $M0 &
+EXPECT_WITHIN 60 "SUCCESS" monkey_unlock
+
+# Sleep > unlock timeout and attempt to write to the file
+sleep 3
+TEST append_to_file $M1/testfile
+
+cleanup
diff --git a/tests/halo.rc b/tests/halo.rc
new file mode 100644
index 00000000000..4cb7c81da85
--- /dev/null
+++ b/tests/halo.rc
@@ -0,0 +1,52 @@
+# Return the current Halo state of a given child (by index, i.e. 0
+# is first child).
+function halo_child_state {
+    grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG | tail -n1 |
+        sed -e 's/^.* halo state: //' -e 's/ .*$//'
+}
+
+# Return number of Halo children which are in a given state.
+# First parameter is total # children.
+# Second parameter is state to match (e.g. "UP").
+function halo_children_in_state {
+    local total=$1
+    local matched=0
+    # Count the children whose most recently logged state equals $2.
+    for CHILD in $(seq 0 $((total-1))); do
+        [ x"$(halo_child_state $CHILD)" != x"$2" ] && continue
+        matched=$((matched+1))
+    done
+    echo $matched
+}
+
+# Return number of up halo children,
+# First parameter is total # children,
+function halo_children_up {
+    halo_children_in_state $1 "UP"
+}
+
+# Return number of down halo children,
+# First parameter is total # children,
+function halo_children_down {
+    halo_children_in_state $1 "DOWN"
+}
+
+# Return number of up & down halo children.
+# First parameter is total number of children.
+function halo_sum_child_states {
+    local total=$1
+    local up_cnt=0
+    local down_cnt=0
+    local state
+    # Tally each child's latest logged state; a child that is neither
+    # UP nor DOWN (e.g. nothing logged yet) is counted in neither total.
+    for CHILD in $(seq 0 $((total-1))); do
+        state=$(halo_child_state $CHILD)
+        case "$state" in
+            UP)   up_cnt=$((up_cnt+1)) ;;
+            DOWN) down_cnt=$((down_cnt+1)) ;;
+        esac
+    done
+
+    echo "$up_cnt $down_cnt"
+}
diff --git a/tests/include.rc b/tests/include.rc
index 492e35a7b6c..9f32e88f5f5 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -19,6 +19,8 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g
CC=cc
OSTYPE=$(uname -s)
+M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point
+
ENV_RC=$(dirname $0)/../env.rc
if [ ! -f $ENV_RC ]; then
ENV_RC=$(dirname $0)/../../env.rc
@@ -612,6 +614,7 @@ function build_tester ()
then
cflags="$cflags $(pkg-config glusterfs-api --cflags-only-I --libs-only-L)"
fi
+ cflags="$cflags ${TESTER_CFLAGS}"
$CC -g -o $(dirname $cfile)/$execname $cfile $cflags
}
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 903fbb39f12..bce94bb8b3b 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht ec
+SUBDIRS = aha stripe afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index f4d1d63c3d5..747577c9380 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -43,6 +43,13 @@
#include "afr-self-heald.h"
#include "afr-messages.h"
+#define CHILD_UP_STR "UP"
+#define CHILD_DOWN_STR "DOWN"
+#define CHILD_DISCONNECTED_STR "DOWN"
+
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *fastest_children);
+
call_frame_t *
afr_copy_frame (call_frame_t *base)
{
@@ -1371,21 +1378,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
sizeof(gfid_copy)) % child_count;
}
+/*
+ * afr_halo_read_subvol
+ *
+ * Given an array representing the readable children, this function will
+ * return which one of the readable children meets the halo hybrid criteria.
+ * In the event none are found, -1 is returned and another strategy will have
+ * to be used to figure out where the read should come from.
+ */
+int
+afr_halo_read_subvol (xlator_t *this, unsigned char *readable)
+{
+        afr_private_t *priv            = this->private;
+        unsigned char *hybrid_children = NULL;
+        int            child           = 0;
+
+        /* Nothing to select unless halo and its hybrid mode are both on. */
+        if (!(priv->halo_enabled && priv->halo_hybrid_mode))
+                return -1;
+
+        /* AFR discovery edge case: stay with the child we are already
+         * pinned to, for consistency, as long as it is readable and its
+         * latency is within the hybrid threshold.
+         */
+        if (priv->read_child >= 0 && readable[priv->read_child] &&
+            priv->child_latency[priv->read_child] <=
+            AFR_HALO_HYBRID_LATENCY_MSEC)
+                return priv->read_child;
+
+        hybrid_children = alloca0 (priv->child_count);
+        if (!find_hybrid_children (this, hybrid_children))
+                return -1;
+
+        /* Pin reads to the first child that is readable and hybrid. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!readable[child] || !hybrid_children[child])
+                        continue;
+                priv->read_child = child;
+                gf_log (this->name, GF_LOG_TRACE,
+                        "Selected hybrid child %d for reads",
+                        child);
+                return child;
+        }
+
+        /* No readable hybrid child; the caller has to fall back to
+         * another read-selection strategy.
+         */
+        return -1;
+}
+
int
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
unsigned char *readable,
afr_read_subvol_args_t *args)
{
- int i = 0;
- int read_subvol = -1;
- afr_private_t *priv = NULL;
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
afr_read_subvol_args_t local_args = {0,};
- priv = this->private;
+ priv = this->private;
+
+ /* Choose lowest latency child for reads */
+ read_subvol = afr_halo_read_subvol (this, readable);
+ if (read_subvol != -1)
+ return read_subvol;
- /* first preference - explicitly specified or local subvolume */
- if (priv->read_child >= 0 && readable[priv->read_child])
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
return priv->read_child;
if (inode_is_linked (inode)) {
@@ -1411,7 +1472,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
return -1;
}
-
int
afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
unsigned char *readable, int *event_p,
@@ -2071,6 +2131,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv->children[child_index]->name);
priv->read_child = child_index;
+ } else if (priv->halo_enabled) {
+ if (priv->read_child < 0) {
+ priv->read_child = child_index;
+ } else if (priv->child_latency[child_index] <
+ priv->child_latency[priv->read_child]) {
+ priv->read_child = child_index;
+ }
}
out:
STACK_DESTROY(frame->root);
@@ -2262,7 +2329,6 @@ unwind:
return 0;
}
-
int
afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
{
@@ -2488,6 +2554,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int call_count = 0;
+ unsigned char *hybrid_children = NULL;
local = frame->local;
priv = this->private;
@@ -2498,8 +2565,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
goto out;
}
- call_count = local->call_count = AFR_COUNT (local->child_up,
- priv->child_count);
+ hybrid_children = alloca0 (priv->child_count);
+ call_count = find_hybrid_children (this, hybrid_children);
+ if (call_count) {
+ for (i = 0; i < priv->child_count; i++)
+ local->child_up[i] = hybrid_children[i];
+ gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid "
+ "children for LOOKUPs", call_count);
+ } else {
+ hybrid_children = NULL;
+ call_count = AFR_COUNT (local->child_up, priv->child_count);
+ }
+
+ local->call_count = call_count;
ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
&local->loc);
@@ -2732,6 +2810,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
+ /* So this is the "secret" to why "Hybrid" halo works. Encoded in
+ * the cached inodes, we store what is effectively the "generational"
+ * state of the cluster along with a "packed" version of the extended
+ * attributes which determine which nodes are wise/fools. We can
+ * consult these cached values to figure out who we can trust, in the
+ * event the state of our cluster changes and we can no longer trust
+ * the cached info we "refresh" the inode (and hit all regions) to
+ * ensure we know which bricks we can safely read from.
+ */
if (event != local->event_generation)
afr_inode_refresh (frame, this, loc->parent, NULL,
afr_lookup_do);
@@ -2956,7 +3043,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_UNWIND (flush, frame, local->op_ret,
@@ -4204,25 +4291,569 @@ __get_heard_from_all_status (xlator_t *this)
return heard_from_all;
}
+/*
+ * _afr_cmp_child
+ *
+ * qsort() comparator for arrays of struct afr_child.  Children that are
+ * marked up sort ahead of children that are marked down; among children
+ * that are marked up, lower latency sorts first.
+ *
+ * Note: latency alone does not decide the order.  A child can be marked
+ * down, and that marking overrides its latency signal.  Having a
+ * lower-latency child available doesn't guarantee that child is marked up:
+ * constantly swapping slightly better bricks for others is jarring to
+ * clients, and the fail-over and max-replicas settings that manage the
+ * up/down state of children must be honored.
+ *
+ * In short, the (as marked) up/down state of a brick always takes
+ * precedence over its latency when sorting.
+ *
+ * Returns -1 if child1 sorts first, 1 if child2 sorts first and 0 if the
+ * two are equivalent (same latency, or both marked down).
+ */
+static int
+_afr_cmp_child (const void *child1, const void *child2)
+{
+        const struct afr_child *lhs = child1;
+        const struct afr_child *rhs = child2;
+
+        if (!lhs->child_up || !rhs->child_up) {
+                /* Two children that are both _marked_ down are equal */
+                if (!lhs->child_up && !rhs->child_up)
+                        return 0;
+
+                /* Exactly one is _marked_ down: prefer the other one */
+                return lhs->child_up ? -1 : 1;
+        }
+
+        /* Both are marked up, so latency is the tie-breaker */
+        if (lhs->latency == rhs->latency)
+                return 0;
+
+        return (lhs->latency > rhs->latency) ? 1 : -1;
+}
+
+/*
+ * find_hybrid_children
+ *
+ * Given a char array with one slot per child (aka the bricks within our
+ * AFR "subvolume"), set the slot to 1 for every child whose latency is
+ * within priv->halo_hybrid_read_max_latency_msec.  If no child meets that
+ * threshold, the first AFR_HALO_HYBRID_CHILD_LIMIT children in sorted order
+ * (see _afr_cmp_child) are marked instead.  Slots are only ever set, never
+ * cleared, so the caller passes in a zeroed array.
+ *
+ * You might ask, why not just pick the quickest brick and be done with it?
+ * Being within our set is not sufficient to be chosen for the read; the
+ * brick must also be marked "readable", so we still want to choose as many
+ * as we can within our local region to ensure somebody is readable.
+ *
+ * To illustrate, consider the case where one of two bricks received a sync
+ * from some other writer while the second, faster brick wasn't present.
+ * In this case we'll want to use the slower brick to service the read.
+ *
+ * In short, this function only tells the caller which children are hybrid
+ * candidates.  It gives no signal as to their readability, nor should it,
+ * since that is handled later in the various flows (e.g. by
+ * afr_halo_read_subvol).
+ *
+ * Returns the number of children marked, or 0 (marking nothing) when halo
+ * or halo hybrid mode is disabled.
+ */
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *hybrid_children)
+{
+        afr_private_t    *priv           = this->private;
+        struct afr_child *candidates     = NULL;
+        uint32_t          latency_cap    = 0;
+        uint32_t          fallback_count = AFR_HALO_HYBRID_CHILD_LIMIT;
+        int32_t           n              = 0;
+
+        if (!(priv->halo_enabled && priv->halo_hybrid_mode))
+                return 0;
+
+        /* Never fall back to more children than we actually have */
+        if (fallback_count > priv->child_count)
+                fallback_count = priv->child_count;
+
+        latency_cap = priv->halo_hybrid_read_max_latency_msec;
+
+        /* Snapshot every child's index, up/down mark and latency */
+        candidates = alloca (sizeof (struct afr_child) * priv->child_count);
+        for (n = 0; n < priv->child_count; n++) {
+                candidates[n].idx = n;
+                candidates[n].child_up = priv->child_up[n];
+                candidates[n].latency = priv->child_latency[n];
+        }
+
+        /* Order the snapshot: marked-up children first, by latency */
+        qsort (candidates, priv->child_count, sizeof (struct afr_child),
+               _afr_cmp_child);
+
+        /* Take the leading run of children within the latency threshold */
+        for (n = 0; n < priv->child_count; n++) {
+                if (candidates[n].latency > latency_cap)
+                        break;
+                hybrid_children[candidates[n].idx] = 1;
+        }
+
+        /* Found some candidates */
+        if (n != 0)
+                return n;
+
+        /* Nothing met the threshold: settle for the best we have, up to
+         * our limit.
+         */
+        for (n = 0; n < fallback_count; n++)
+                hybrid_children[candidates[n].idx] = 1;
+
+        return n;
+}
+
+/*
+ * find_best_down_child
+ *
+ * Returns the index of the lowest-latency child that is currently marked
+ * down (child_up == 0), or -1 if there is no such child.  Children with a
+ * negative recorded latency are skipped.  Because the search starts from
+ * INT64_MAX and uses a strict '<', a child whose latency is INT64_MAX
+ * (AFR_CHILD_DOWN_LATENCY) is never selected.  On ties the lowest index
+ * wins.
+ */
+int
+find_best_down_child (xlator_t *this)
+{
+        afr_private_t *priv         = NULL;
+        int            i            = -1;
+        int32_t        best_child   = -1;
+        int64_t        best_latency = INT64_MAX;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!priv->child_up[i] &&
+                    priv->child_latency[i] >= 0 &&
+                    priv->child_latency[i] < best_latency) {
+                        best_child = i;
+                        best_latency = priv->child_latency[i];
+                }
+        }
+        if (best_child >= 0) {
+                /* int64_t must be printed with PRId64, not %ld */
+                gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) "
+                        "@ %"PRId64" ms latency", best_child, best_latency);
+        }
+        return best_child;
+}
+
+/*
+ * find_worst_up_child
+ *
+ * Returns the index of the highest-latency child whose child_up entry is
+ * non-zero, or -1 if there is no such child.  Children with a negative
+ * recorded latency are skipped.  The comparison is '>=', so on ties the
+ * highest index wins.
+ */
+int
+find_worst_up_child (xlator_t *this)
+{
+        afr_private_t *priv          = NULL;
+        int            i             = -1;
+        int32_t        worst_child   = -1;
+        int64_t        worst_latency = INT64_MIN;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->child_up[i] &&
+                    priv->child_latency[i] >= 0 &&
+                    priv->child_latency[i] >= worst_latency) {
+                        worst_child = i;
+                        worst_latency = priv->child_latency[i];
+                }
+        }
+        if (worst_child >= 0) {
+                /* int64_t must be printed with PRId64, not %ld */
+                gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)"
+                        " @ %"PRId64" ms latency", worst_child, worst_latency);
+        }
+        return worst_child;
+}
+
+/*
+ * Map a child_up value to a printable halo state: 0 is "DOWN", 1 is "UP"
+ * and anything else is "unknown".
+ */
+static const char *halo_state_str(int i)
+{
+        if (i == 0)
+                return "DOWN";
+
+        if (i == 1)
+                return "UP";
+
+        return "unknown";
+}
+
+
+/*
+ * Log, at DEBUG level, one line per child giving its halo state (see
+ * halo_state_str) and its recorded latency.  The latency is printed as
+ * "N/A" for a child whose latency is AFR_CHILD_DOWN_LATENCY.
+ */
+static void dump_halo_states (xlator_t *this) {
+        afr_private_t *priv  = this->private;
+        int            child = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                const char *state = halo_state_str(priv->child_up[child]);
+
+                if (priv->child_latency[child] != AFR_CHILD_DOWN_LATENCY) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Child %d halo state: %s (%"PRIi64" ms)",
+                                child, state, priv->child_latency[child]);
+                        continue;
+                }
+
+                /* No meaningful latency is recorded for this child */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Child %d halo state: %s (N/A)",
+                        child, state);
+        }
+}
+
+/*
+ * _afr_handle_ping_event
+ *
+ * Handles a GF_EVENT_CHILD_PING received from child 'idx'.  The child's
+ * latency is taken from the minimum client latency seen so far and stored
+ * in both priv->child_latency[idx] and *child_latency_msec.  The function
+ * then decides whether the ping should become an up/down transition, and
+ * if so overwrites *event with GF_EVENT_CHILD_UP or GF_EVENT_CHILD_DOWN.
+ * *event is left untouched otherwise.  priv->child_up is not modified
+ * here.
+ *
+ * this                  - the AFR xlator
+ * child_xlator          - the child the ping came from
+ * idx                   - index of that child in priv's per-child arrays
+ * halo_max_latency_msec - latency threshold of this halo
+ * event                 - in/out: the event being processed
+ * child_latency_msec    - out: latency computed for this child
+ * child_halo_enabled    - whether halo decisions apply to this child
+ *
+ * afr_notify() calls this with priv->lock held.
+ */
+static void
+_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
+                        const int idx, const int64_t halo_max_latency_msec,
+                        int32_t *event, int64_t *child_latency_msec,
+                        gf_boolean_t child_halo_enabled)
+{
+        afr_private_t *priv            = NULL;
+        int            i               = -1;
+        int            up_children     = 0;
+        uint64_t       latency_samples = 0;
+
+        priv = this->private;
+
+        /* Base it off the _minimum_ latency we've ever seen */
+        *child_latency_msec = child_xlator->client_latency.min / 1000.0;
+        latency_samples = child_xlator->client_latency.count;
+        priv->child_latency[idx] = *child_latency_msec;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->child_up[i] == 1) {
+                        up_children++;
+                }
+        }
+
+        /* Halo is configured but not in effect for this child: afr_notify
+         * leaves child_halo_enabled off until a minimum number of latency
+         * samples has been collected.  Just report it.
+         */
+        if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) {
+                gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient "
+                        " number of latency samples (%" PRIu64
+                        " < %u), halo in-active.",
+                        latency_samples, priv->halo_min_samples);
+        }
+
+        /* child_xlator was already dereferenced above, so no NULL check is
+         * needed here.
+         */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "ping: child %d (%s) latency %"PRId64" ms (max %"PRId64" ms)"
+                " up_count %d (min %u) enabled %s",
+                idx, child_xlator->name,
+                *child_latency_msec,
+                halo_max_latency_msec,
+                up_children,
+                priv->halo_min_replicas,
+                child_halo_enabled ? "true" : "false");
+
+        /*
+         * Case 1: This child's latency exceeds the maximum allowable
+         * for this halo.  Only the worst up child is marked down, and
+         * only while more than halo_min_replicas children are up.
+         */
+        if (child_halo_enabled &&
+            *child_latency_msec > halo_max_latency_msec &&
+            priv->child_up[idx] == 1 &&
+            up_children > priv->halo_min_replicas) {
+                if (find_worst_up_child (this) == idx) {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%"PRIi64"ms) "
+                                "exceeds halo threshold (%"PRIi64"), "
+                                "marking child down, "
+                                "min_replicas (%u) still "
+                                "satisfied.",
+                                *child_latency_msec,
+                                halo_max_latency_msec,
+                                priv->halo_min_replicas);
+                        *event = GF_EVENT_CHILD_DOWN;
+                }
+        /*
+         * Case 2: Child latency is within halo (or halo is not in effect)
+         * and the child is currently marked down: mark it up, unless halo
+         * is in effect and halo_max_replicas children are already up.
+         */
+        } else if ((child_halo_enabled == _gf_false ||
+                    *child_latency_msec <= halo_max_latency_msec) &&
+                   priv->child_up[idx] == 0) {
+                if (child_halo_enabled == _gf_false ||
+                    up_children < priv->halo_max_replicas) {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%"PRId64" ms) "
+                                "below halo threshold (%"PRId64") or halo is "
+                                "disabled, marking child up.",
+                                *child_latency_msec,
+                                halo_max_latency_msec);
+                        *event = GF_EVENT_CHILD_UP;
+                } else {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Not marking child %d up, "
+                                "max replicas (%u) reached.", idx,
+                                priv->halo_max_replicas);
+                }
+        /*
+         * Case 3: Child latency is within halo, and currently marked up,
+         * mark it down if it's the highest latency child and the
+         * number of up children is greater than halo_max_replicas.
+         * UNLESS you are an SHD in which case do nothing.
+         */
+        } else if ((child_halo_enabled == _gf_true &&
+                    *child_latency_msec <= halo_max_latency_msec) &&
+                   priv->child_up[idx] == 1) {
+                if (find_worst_up_child (this) == idx &&
+                    up_children > priv->halo_max_replicas &&
+                    !priv->shd.iamshd) {
+                        /* Latency is within the threshold in this branch;
+                         * the child goes down only because too many
+                         * replicas are up.
+                         */
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%"PRIi64"ms) "
+                                "is within halo threshold (%"PRIi64"), "
+                                "but halo_max_replicas (%u) exceeded, "
+                                "marking child down.",
+                                *child_latency_msec,
+                                halo_max_latency_msec,
+                                priv->halo_max_replicas);
+                        *event = GF_EVENT_CHILD_DOWN;
+                }
+        }
+
+        /* The ping became a transition: dump the states as they are
+         * before that transition is applied.
+         */
+        if (*event != GF_EVENT_CHILD_PING &&
+            gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+                gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:");
+                dump_halo_states (this);
+        }
+}
+
+/*
+ * _afr_handle_child_up_event
+ *
+ * Applies a GF_EVENT_CHILD_UP for child 'idx': marks the child up, bumps
+ * priv->event_generation if it was not already up, and then enforces the
+ * halo limits, which may immediately mark the worst up child (possibly
+ * 'idx' itself) down again.
+ *
+ * this                  - the AFR xlator
+ * child_xlator          - the child that came up (used for logging)
+ * idx                   - index of that child in priv's per-child arrays
+ * halo_max_latency_msec - latency threshold of this halo
+ * event                 - in/out: rewritten to GF_EVENT_CHILD_MODIFIED
+ *                         unless exactly one child ends up marked up
+ * call_psh              - out: set to 1, or to 0 when a child was marked
+ *                         down again by the halo limits
+ * up_child              - out: set to idx
+ * child_halo_enabled    - whether halo decisions apply to this child
+ *
+ * The final value of *event is recorded in priv->last_event[idx].
+ */
+void
+_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
+                            const int idx, int64_t halo_max_latency_msec,
+                            int32_t *event, int32_t *call_psh, int32_t *up_child,
+                            gf_boolean_t child_halo_enabled)
+{
+        afr_private_t *priv           = NULL;
+        int            i              = -1;
+        int            up_children    = 0;
+        int            worst_up_child = -1;
+        gf_boolean_t   was_down       = _gf_false;
+
+        priv = this->private;
+
+        /*
+         * This only really counts if the child was never up
+         * (value = -1) or had been down (value = 0). See
+         * comment at GF_EVENT_CHILD_DOWN for a more detailed
+         * explanation.
+         */
+        if (priv->child_up[idx] != 1) {
+                /*
+                 * Track the fact we did this, we may need to repeal this
+                 * if we later decide to mark this brick down.
+                 */
+                was_down = _gf_true;
+                priv->event_generation++;
+        }
+        priv->child_up[idx] = 1;
+
+        *call_psh = 1;
+        *up_child = idx;
+        for (i = 0; i < priv->child_count; i++)
+                if (priv->child_up[i] == 1)
+                        up_children++;
+
+        /*
+         * Handle the edge case where we exceed
+         * halo_min_replicas and we've got a child which is
+         * marked up as it was helping to satisfy the
+         * halo_min_replicas even though its latency exceeds
+         * halo_max_latency_msec.
+         */
+        if (child_halo_enabled == _gf_true &&
+            up_children > priv->halo_min_replicas) {
+                worst_up_child = find_worst_up_child (this);
+                if (worst_up_child >= 0 &&
+                    priv->child_latency[worst_up_child] >
+                    halo_max_latency_msec) {
+                        if (was_down == _gf_true)
+                                priv->event_generation--;
+                        *call_psh = 0;
+                        priv->child_up[worst_up_child] = 0;
+                        up_children--;
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Marking child %d down, "
+                                "doesn't meet halo threshold "
+                                "(%"PRId64"), and > "
+                                "halo_min_replicas (%u)",
+                                worst_up_child,
+                                halo_max_latency_msec,
+                                priv->halo_min_replicas);
+                        goto out;
+                }
+        }
+
+        /* More children are up than halo_max_replicas allows: drop the
+         * worst one.  The SHD is exempt from this limit.
+         */
+        if (priv->halo_enabled &&
+            up_children > priv->halo_max_replicas &&
+            !priv->shd.iamshd) {
+                if (was_down == _gf_true)
+                        priv->event_generation--;
+                *call_psh = 0;
+                worst_up_child = find_worst_up_child (this);
+                if (worst_up_child < 0) {
+                        worst_up_child = idx;
+                }
+                priv->child_up[worst_up_child] = 0;
+                gf_log (this->name, GF_LOG_INFO,
+                        "Marking child %d down, "
+                        "up_children (%d) > "
+                        "halo_max_replicas (%u)",
+                        worst_up_child,
+                        up_children,
+                        priv->halo_max_replicas);
+                up_children--;
+        }
+out:
+        if (up_children == 1) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "Subvolume '%s' came back up; "
+                        "going online.",
+                        child_xlator->name);
+        } else {
+                *event = GF_EVENT_CHILD_MODIFIED;
+        }
+
+        priv->last_event[idx] = *event;
+
+        if (gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+                gf_log (this->name, GF_LOG_DEBUG, "New halo states:");
+                dump_halo_states (this);
+        }
+}
+
+/*
+ * _afr_handle_child_down_event
+ *
+ * Applies a GF_EVENT_CHILD_DOWN for child 'idx': marks the child down,
+ * bumps priv->event_generation if it had been up, and, when halo failover
+ * is enabled, may mark the best currently-down child up in its place.
+ *
+ * this                  - the AFR xlator
+ * child_xlator          - the child that went down (not used here)
+ * idx                   - index of that child in priv's per-child arrays
+ * child_latency_msec    - AFR_CHILD_DOWN_LATENCY for an actual disconnect;
+ *                         any other value leaves the recorded latency alone
+ * halo_max_latency_msec - latency threshold of this halo
+ * event                 - in/out: rewritten to GF_EVENT_CHILD_MODIFIED
+ *                         unless every child is now marked down
+ * call_psh              - out: set to 1 when a replacement child is
+ *                         marked up
+ * up_child              - out: index of that replacement child
+ * child_halo_enabled    - whether halo decisions apply to this child
+ *
+ * The final value of *event is recorded in priv->last_event[idx].
+ */
+void
+_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
+                              int idx, int64_t child_latency_msec,
+                              int64_t halo_max_latency_msec, int32_t *event,
+                              int32_t *call_psh, int32_t *up_child,
+                              gf_boolean_t child_halo_enabled)
+{
+        afr_private_t *priv            = NULL;
+        int            i               = -1;
+        int            up_children     = 0;
+        int            down_children   = 0;
+        int            best_down_child = -1;
+        gf_boolean_t   swap_child      = _gf_false;
+
+        priv = this->private;
+
+        /*
+         * If a brick is down when we start, we'll get a
+         * CHILD_DOWN to indicate its initial state.  There
+         * was never a CHILD_UP in this case, so if we
+         * increment "down_count" the difference between that
+         * and "up_count" will no longer be the number of
+         * children that are currently up.  This has serious
+         * implications e.g. for quorum enforcement, so we
+         * don't increment these values unless the event
+         * represents an actual state transition between "up"
+         * (value = 1) and anything else.
+         */
+        if (priv->child_up[idx] == 1) {
+                priv->event_generation++;
+        }
+
+        /*
+         * If this is an _actual_ CHILD_DOWN event, we
+         * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to
+         * indicate the child is really disconnected.
+         */
+        if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) {
+                priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY;
+        }
+        priv->child_up[idx] = 0;
+
+        for (i = 0; i < priv->child_count; i++)
+                if (priv->child_up[i] == 1)
+                        up_children++;
+
+        /*
+         * Handle the edge case where we need to find the
+         * next best child (to mark up) as marking this child
+         * down would cause us to fall below halo_min_replicas.
+         * We will also force the SHD to heal this child _now_
+         * as we want it to be up to date if we are going to
+         * begin using it synchronously.
+         */
+        best_down_child = find_best_down_child (this);
+        if (child_halo_enabled == _gf_true &&
+            priv->halo_failover_enabled == _gf_true) {
+                if (up_children < priv->halo_min_replicas) {
+                        swap_child = _gf_true;
+                } else if (up_children < priv->halo_max_replicas &&
+                           /* find_best_down_child returns -1 when there
+                            * is no candidate; never index with it.
+                            */
+                           best_down_child >= 0 &&
+                           priv->child_latency[best_down_child] <=
+                           halo_max_latency_msec) {
+                        swap_child = _gf_true;
+                }
+        }
+
+        if (swap_child) {
+                if (best_down_child >= 0) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "Swapping out child %d for "
+                                "child %d to satisfy "
+                                "halo_min_replicas (%u).",
+                                idx, best_down_child,
+                                priv->halo_min_replicas);
+                        priv->child_up[best_down_child] = 1;
+                        *call_psh = 1;
+                        *up_child = best_down_child;
+                }
+        }
+
+        for (i = 0; i < priv->child_count; i++)
+                if (priv->child_up[i] == 0)
+                        down_children++;
+        if (down_children == priv->child_count) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "All subvolumes are down. Going "
+                        "offline until atleast one of them "
+                        "comes back up.");
+        } else {
+                *event = GF_EVENT_CHILD_MODIFIED;
+        }
+        priv->last_event[idx] = *event;
+
+        if (gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+                gf_log (this->name, GF_LOG_DEBUG, "New halo states:");
+                dump_halo_states (this);
+        }
+}
+
+/*
+ * _afr_get_halo_latency
+ *
+ * Returns the halo latency threshold (msec) that applies to this process:
+ * the SHD value when priv->shd.iamshd is set, otherwise the NFS daemon
+ * value when priv->nfsd.iamnfsd is set, otherwise the general
+ * priv->halo_max_latency_msec.  The chosen value is logged at DEBUG level.
+ */
+int64_t
+_afr_get_halo_latency (xlator_t *this)
+{
+        afr_private_t *priv                  = NULL;
+        int64_t        halo_max_latency_msec = 0;
+
+        priv = this->private;
+
+        if (priv->shd.iamshd) {
+                halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+        } else if (priv->nfsd.iamnfsd) {
+                halo_max_latency_msec =
+                        priv->nfsd.halo_max_latency_msec;
+        } else {
+                halo_max_latency_msec = priv->halo_max_latency_msec;
+        }
+        /* int64_t must be printed with PRId64, not %ld */
+        gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %"PRId64,
+                halo_max_latency_msec);
+        return halo_max_latency_msec;
+}
+
+
int32_t
afr_notify (xlator_t *this, int32_t event,
void *data, void *data2)
{
+ xlator_t *child_xlator = NULL;
afr_private_t *priv = NULL;
int i = -1;
- int up_children = 0;
- int down_children = 0;
int propagate = 0;
int had_heard_from_all = 0;
int have_heard_from_all = 0;
int idx = -1;
int ret = -1;
int call_psh = 0;
+ int up_child = -1;
+ uint64_t latency_samples = 0;
dict_t *input = NULL;
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY;
+ gf_boolean_t child_halo_enabled = _gf_false;
+ child_xlator = (xlator_t *)data;
priv = this->private;
if (!priv)
@@ -4235,7 +4866,7 @@ afr_notify (xlator_t *this, int32_t event,
* O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
*/
priv->did_discovery = _gf_false;
-
+ latency_samples = child_xlator->client_latency.count;
/* parent xlators dont need to know about every child_up, child_down
* because of afr ha. If all subvolumes go down, child_down has
@@ -4246,7 +4877,7 @@ afr_notify (xlator_t *this, int32_t event,
* subsequent revalidate lookup happens on all the dht's subvolumes
* which triggers afr self-heals if any.
*/
- idx = find_child_index (this, data);
+ idx = find_child_index (this, child_xlator);
if (idx < 0) {
gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
"Received child_up from invalid subvolume");
@@ -4255,6 +4886,28 @@ afr_notify (xlator_t *this, int32_t event,
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
this);
+
+ if (!priv->halo_enabled ||
+ latency_samples < priv->halo_min_samples) {
+ child_halo_enabled = _gf_false;
+ halo_max_latency_msec = INT64_MAX;
+ } else {
+ child_halo_enabled = _gf_true;
+ halo_max_latency_msec = _afr_get_halo_latency (this);
+ }
+
+ if (event == GF_EVENT_CHILD_PING) {
+ /* Calculates the child latency and sets event
+ */
+ LOCK (&priv->lock);
+ {
+ _afr_handle_ping_event (this, child_xlator, idx,
+ halo_max_latency_msec, &event,
+ &child_latency_msec, child_halo_enabled);
+ }
+ UNLOCK (&priv->lock);
+ }
+
if (event == GF_EVENT_TRANSLATOR_OP) {
LOCK (&priv->lock);
{
@@ -4281,52 +4934,16 @@ afr_notify (xlator_t *this, int32_t event,
propagate = 1;
break;
case GF_EVENT_CHILD_UP:
- /*
- * This only really counts if the child was never up
- * (value = -1) or had been down (value = 0). See
- * comment at GF_EVENT_CHILD_DOWN for a more detailed
- * explanation.
- */
- if (priv->child_up[idx] != 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 1;
-
- call_psh = 1;
- up_children = __afr_get_up_children_count (priv);
- if (up_children == 1) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SUBVOL_UP,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
- } else {
- event = GF_EVENT_CHILD_MODIFIED;
- }
-
- priv->last_event[idx] = event;
-
+ _afr_handle_child_up_event (this, child_xlator,
+ idx, halo_max_latency_msec, &event, &call_psh,
+ &up_child, child_halo_enabled);
break;
case GF_EVENT_CHILD_DOWN:
- if (priv->child_up[idx] == 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 0;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 0)
- down_children++;
- if (down_children == priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
- } else {
- event = GF_EVENT_SOME_CHILD_DOWN;
- }
-
- priv->last_event[idx] = event;
-
+ _afr_handle_child_down_event (this, child_xlator, idx,
+ child_latency_msec, halo_max_latency_msec,
+ &event, &call_psh, &up_child,
+ child_halo_enabled);
break;
case GF_EVENT_CHILD_CONNECTING:
@@ -4353,7 +4970,6 @@ afr_notify (xlator_t *this, int32_t event,
had come up, propagate CHILD_UP, but only this time
*/
event = GF_EVENT_CHILD_DOWN;
- up_children = __afr_get_up_children_count (priv);
for (i = 0; i < priv->child_count; i++) {
if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
event = GF_EVENT_CHILD_UP;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7f7962013d7..c7d6261b110 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -46,7 +46,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_spbc_timeout_t,
gf_afr_mt_spb_status_t,
gf_afr_mt_empty_brick_t,
- gf_afr_mt_end
+ gf_afr_mt_child_latency_t,
+ gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 4becfb835e8..87542799a5b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -371,7 +371,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_private_t *priv = NULL;
off_t off = 0;
- size_t block = 128 * 1024;
+ size_t block = 0;
int type = AFR_SELFHEAL_DATA_FULL;
int ret = -1;
call_frame_t *iter_frame = NULL;
@@ -383,6 +383,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
healed_sinks[ARBITER_BRICK_INDEX] = 0;
}
+ block = 128 * 1024 * priv->data_self_heal_window_size;
+
type = afr_data_self_heal_type_get (priv, healed_sinks, source,
replies);
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index c6ac5ebfd1b..4ac1d32f58a 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -58,6 +58,7 @@ typedef struct {
eh_t **statistics;
uint32_t max_threads;
uint32_t wait_qlength;
+ uint32_t halo_max_latency_msec;
} afr_self_heald_t;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 6f4783c9213..ae9b28c7fb4 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -176,6 +176,42 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("data-self-heal-algorithm",
priv->data_self_heal_algorithm, options, str, out);
+ GF_OPTION_RECONF ("halo-enabled",
+ priv->halo_enabled, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-failover-enabled",
+ priv->halo_failover_enabled, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-shd-max-latency",
+ priv->shd.halo_max_latency_msec, options, uint32,
+ out);
+
+ GF_OPTION_RECONF ("halo-nfsd-max-latency",
+ priv->nfsd.halo_max_latency_msec, options, uint32,
+ out);
+
+ GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
+ options, uint32, out);
+
+ GF_OPTION_RECONF ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options,
+ uint32, out);
+
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -396,6 +432,35 @@ init (xlator_t *this)
GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
+ GF_OPTION_INIT ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, bool, out);
+
+ GF_OPTION_INIT ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, uint32,
+ out);
+
+ GF_OPTION_INIT ("halo-enabled",
+ priv->halo_enabled, bool, out);
+
+ GF_OPTION_INIT ("halo-failover-enabled",
+ priv->halo_failover_enabled, bool, out);
+
+ GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ uint32, out);
+ GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec,
+ uint32, out);
+ GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32,
+ out);
+ GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32,
+ out);
+ GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32,
+ out);
+
+ GF_OPTION_INIT ("halo-nfsd-max-latency",
+ priv->nfsd.halo_max_latency_msec, uint32, out);
+
+ GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+
GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
@@ -445,17 +510,24 @@ init (xlator_t *this)
priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
gf_afr_mt_char);
- if (!priv->child_up) {
+
+ priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency),
+ child_count,
+ gf_afr_mt_child_latency_t);
+
+ if (!priv->child_up || !priv->child_latency) {
ret = -ENOMEM;
goto out;
}
- for (i = 0; i < child_count; i++)
+ for (i = 0; i < child_count; i++) {
+ priv->child_latency[i] = 0.0;
priv->child_up[i] = -1; /* start with unknown state.
this initialization needed
for afr_notify() to work
reliably
*/
+ }
priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
gf_afr_mt_xlator_t);
@@ -663,6 +735,85 @@ struct volume_options options[] = {
"jobs that can perform parallel heals in the "
"background."
},
+ { .key = {"halo-shd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .description = "Maximum latency for shd halo replication in msec."
+ },
+ { .key = {"halo-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .description = "Enable Halo (geo) replication mode."
+ },
+ { .key = {"halo-failover-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .description = "Enable x-halo failover: will allow failover "
+ "to bricks outside the client or daemons' halo "
+ "in an attempt to satisfy halo-min-replicas."
+ },
+ { .key = {"halo-nfsd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .description = "Maximum latency for nfsd halo replication in msec."
+ },
+ { .key = {"halo-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .description = "Maximum latency for halo replication in msec."
+ },
+        /* Adjacent string literals are concatenated as-is, so every piece
+         * but the last carries its own trailing space.
+         */
+        { .key   = {"halo-hybrid-mode"},
+          .type  = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "Enable hybrid sync mounts. When enabled, halo will "
+                         "do write FOPs synchronously, and read FOPs will be "
+                         "serviced in-region if the inode is clean/consistent. "
+                         "If no bricks can be found below "
+                         "halo-hybrid-read-max-latency then the best 2 shall "
+                         "be selected. This option can be used in "
+                         "conjunction with all other halo options."
+        },
+ { .key = {"halo-hybrid-read-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "8",
+ .description = "Maximum latency hybrid mode will use to select "
+ "children for read FOPs. Don't tune this unless "
+ "you really know what you are doing (i.e. you've "
+ "read/understand the associated source code)."
+ },
+        /* Adjacent string literals are concatenated as-is, so every piece
+         * but the last carries its own trailing space.
+         */
+        { .key   = {"halo-max-replicas"},
+          .type  = GF_OPTION_TYPE_INT,
+          .min   = 1,
+          .max   = 99999,
+          .default_value = "99999",
+          .description = "The maximum number of halo replicas; replicas "
+                         "beyond this value will be written asynchronously "
+                         "via the SHD."
+        },
+ { .key = {"halo-min-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "2",
+ .description = "The minimum number of halo replicas, before adding "
+ "out of region replicas."
+ },
+ { .key = {"halo-min-samples"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "3",
+ .description = "The minimum number of halo latency samples, before "
+ "we start forming the halos."
+ },
{ .key = {"heal-wait-queue-length"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
@@ -802,6 +953,13 @@ struct volume_options options[] = {
"translator is running as part of self-heal-daemon "
"or not."
},
+ { .key = {"iam-nfs-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of an NFS daemon "
+ "or not."
+ },
{ .key = {"quorum-type"},
.type = GF_OPTION_TYPE_STR,
.value = { "none", "auto", "fixed"},
@@ -866,7 +1024,7 @@ struct volume_options options[] = {
},
{ .key = {"heal-timeout"},
.type = GF_OPTION_TYPE_INT,
- .min = 60,
+ .min = 5,
.max = INT_MAX,
.default_value = "600",
.description = "time interval for checking the need to self-heal "
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 4bffc30788a..d09aa6852c8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,6 +30,9 @@
#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* Latency for down children */
+#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Max children find_hybrid_children picks when none meet the latency threshold */
+#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */
#define AFR_LOCKEE_COUNT_MAX 3
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
@@ -60,6 +63,17 @@ typedef enum {
AFR_FAV_CHILD_POLICY_MAX,
} afr_favorite_child_policy;
+/* Settings that apply when this xlator runs inside an NFS daemon. */
+struct afr_nfsd {
+ /* Loaded from the "iam-nfs-daemon" option in init() */
+ gf_boolean_t iamnfsd;
+ /* Loaded from the "halo-nfsd-max-latency" option (msec) */
+ uint32_t halo_max_latency_msec;
+};
+
+/*
+ * Snapshot of one child, built by find_hybrid_children() so the children
+ * can be sorted with _afr_cmp_child() without disturbing priv's arrays.
+ */
+struct afr_child {
+ /* Index of the child in priv's per-child arrays */
+ uint32_t idx;
+ /* Copy of priv->child_latency[idx] */
+ int64_t latency;
+ /* Copy of priv->child_up[idx] */
+ unsigned char child_up;
+};
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -71,6 +85,7 @@ typedef struct _afr_private {
inode_t *root_inode;
unsigned char *child_up;
+ int64_t *child_latency;
unsigned char *local;
char **pending_key;
@@ -141,8 +156,19 @@ typedef struct _afr_private {
gf_boolean_t ensure_durability;
char *sh_domain;
char *afr_dirty;
-
- afr_self_heald_t shd;
+ gf_boolean_t halo_enabled;
+
+ /* Halo geo-replication tunables */
+ gf_boolean_t halo_failover_enabled;
+ gf_boolean_t halo_hybrid_mode;
+ uint32_t halo_hybrid_read_max_latency_msec;
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
+ uint32_t halo_min_samples;
+
+ afr_self_heald_t shd;
+ struct afr_nfsd nfsd;
gf_boolean_t consistent_metadata;
uint64_t spb_choice_timeout;
diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/cluster/aha/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am
new file mode 100644
index 00000000000..006db127d28
--- /dev/null
+++ b/xlators/cluster/aha/src/Makefile.am
@@ -0,0 +1,18 @@
+
+xlator_LTLIBRARIES = aha.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+aha_la_LDFLAGS = -module -avoid-version
+
+aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c
+aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# Each header is listed exactly once.
+noinst_HEADERS = aha-mem-types.h aha.h aha-helpers.h aha-retry.h aha-fops.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c
new file mode 100644
index 00000000000..3b2ca641de2
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-fops.c
@@ -0,0 +1,952 @@
+#include "aha-fops.h"
+
+#include <stdlib.h>
+
+/*
+ * Append fop to the tail of conf->failed. Takes no lock itself; save_fop ()
+ * is the wrapper that holds conf->lock around this call.
+ */
+static void
+__save_fop (struct aha_fop *fop, struct aha_conf *conf)
+{
+ list_add_tail (&fop->list, &conf->failed);
+}
+
+/*
+ * Queue fop on conf's list of failed fops, holding conf->lock for the
+ * duration of the list update.
+ */
+void
+save_fop (struct aha_fop *fop, struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        __save_fop (fop, conf);
+        UNLOCK (&conf->lock);
+}
+
+/*
+ * AHA_HANDLE_FOP
+ *
+ * Wind a fop to the child while keeping enough state to replay it later:
+ * 1) Allocate a struct aha_fop and capture the call in a stub whose resume
+ *    function is aha_<type>.
+ * 2) Attach the aha_fop to the frame as frame->local.
+ * 3) STACK_WIND to obj/fn with cbk as the callback.
+ *
+ * Allocation failure is deliberately fatal (see the log message). abort ()
+ * is used rather than assert (0) so the process still stops when built with
+ * NDEBUG, and a failed stub allocation is treated like a failed fop one.
+ * The frame parameter is named frm so that it cannot rewrite the
+ * struct member in "fop->frame".
+ */
+#define AHA_HANDLE_FOP(frm, type, cbk, obj, fn, args ...)               \
+        do {                                                            \
+                struct aha_fop *fop = aha_fop_new ();                   \
+                if (fop)                                                \
+                        fop->stub = fop_##type##_stub (frm,             \
+                                                       aha_##type,      \
+                                                       args);           \
+                if (!fop || !fop->stub) {                               \
+                        gf_log (GF_AHA, GF_LOG_CRITICAL,                \
+                                "Allocation failed, terminating "       \
+                                "to prevent a hung mount.");            \
+                        abort ();                                       \
+                }                                                       \
+                fop->frame = frm;                                       \
+                (frm)->local = fop;                                     \
+                STACK_WIND (frm, cbk, obj, fn, args);                   \
+        } while (0)
+
+/*
+ * AHA_HANDLE_FOP_CBK
+ *
+ * Common tail of every fop callback. It reads op_ret and op_errno from the
+ * enclosing callback's scope (they are not macro parameters).
+ *
+ * 1) If the fop failed with ENOTCONN *and* the timer that waits for the
+ *    server to come back has not expired, store the fop to retry later.
+ * 2) If the timer waiting for the server has expired, just unwind.
+ * 3) If the error returned is something other than ENOTCONN, just unwind.
+ *
+ * Failure is tested as op_ret < 0: a successful fop may return a positive
+ * op_ret (e.g. a byte count), and such a reply must never be queued.
+ */
+#define AHA_HANDLE_FOP_CBK(type, frame, args ...)                       \
+        do {                                                            \
+                struct aha_conf *conf = frame->this->private;           \
+                struct aha_fop *fop = frame->local;                     \
+                if (op_ret < 0 && op_errno == ENOTCONN &&               \
+                    !aha_is_timer_expired (conf)) {                     \
+                        gf_log (GF_AHA, GF_LOG_WARNING,                 \
+                                "Got ENOTCONN from client, storing "    \
+                                "to retry later!");                     \
+                        save_fop (fop, conf);                           \
+                } else {                                                \
+                        AHA_DESTROY_LOCAL (frame);                      \
+                        STACK_UNWIND_STRICT (type, frame, args);        \
+                }                                                       \
+        } while (0)
+
+/*
+ * lookup callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode,
+ buf, xdata, postparent);
+ return 0;
+}
+
+
+/*
+ * lookup entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup,
+ loc, xdata);
+ return 0;
+}
+
+
+/*
+ * stat callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+/*
+ * stat entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, stat, aha_stat_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->stat,
+ loc, xdata);
+ return 0;
+}
+
+
+/*
+ * setattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop,
+ postop, xdata);
+ return 0;
+}
+
+
+/*
+ * setattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+}
+
+
+/*
+ * fsetattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop,
+ postop, xdata);
+ return 0;
+}
+
+/*
+ * fsetattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+}
+
+
+/*
+ * truncate callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+/*
+ * truncate entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+
+/*
+ * ftruncate callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+/*
+ * ftruncate entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+
+/*
+ * access callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * access entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t mask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, access, aha_access_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->access,
+ loc, mask, xdata);
+ return 0;
+}
+
+
+/*
+ * readlink callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *path, struct iatt *sbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno,
+ path, sbuf, xdata);
+ return 0;
+}
+
+
+/*
+ * readlink entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ size_t size, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readlink,
+ loc, size, xdata);
+ return 0;
+}
+
+
+/*
+ * mknod callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno,
+ inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * mknod entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->mknod,
+ loc, mode, rdev, umask, xdata);
+ return 0;
+}
+
+
+/*
+ * mkdir callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno,
+ inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+/*
+ * mkdir entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
+
+
+/*
+ * unlink callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * unlink entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+}
+
+
+/*
+ * rmdir callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * rmdir entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * symlink callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * symlink entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
+}
+
+
+/*
+ * rename callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf,
+ preoldparent, postoldparent,
+ prenewparent, postnewparent, xdata);
+ return 0;
+}
+
+
+/*
+ * rename entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, rename, aha_rename_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+
+/*
+ * link callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * link entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, link, aha_link_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+
+/*
+ * create callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+/*
+ * create entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, create, aha_create_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+}
+
+
+/*
+ * open callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+
+/*
+ * open entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, open, aha_open_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+}
+
+/*
+ * readv callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno,
+ vector, count, stbuf, iobref, xdata);
+ return 0;
+}
+
+/*
+ * readv entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readv, aha_readv_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * writev callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+/*
+ * writev entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count,
+ off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, writev, aha_writev_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+ fd, vector, count, off, flags, iobref, xdata);
+ return 0;
+}
+
+
+/*
+ * flush callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * flush entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, flush, aha_flush_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->flush,
+ fd, xdata);
+ return 0;
+}
+
+
+/*
+ * fsync callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fsync_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+/*
+ * fsync entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync,
+ fd, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * fstat callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+
+/*
+ * fstat entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fstat,
+ fd, xdata);
+ return 0;
+}
+
+
+/*
+ * opendir callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+
+/*
+ * opendir entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, opendir, aha_opendir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->opendir,
+ loc, fd, xdata);
+ return 0;
+}
+
+/*
+ * fsyncdir callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * fsyncdir entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsyncdir,
+ fd, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * statfs callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+
+/*
+ * statfs entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->statfs,
+ loc, xdata);
+ return 0;
+}
+
+
+
+/*
+ * setxattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * setxattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * getxattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * getxattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/*
+ * fsetxattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * fsetxattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+}
+
+
+/*
+ * fgetxattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * fgetxattr entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fgetxattr,
+ fd, name, xdata);
+ return 0;
+}
+
+
+/*
+ * xattrop callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * xattrop entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->xattrop,
+ loc, flags, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * fxattrop callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * fxattrop entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fxattrop,
+ fd, flags, dict, xdata);
+ return 0;
+}
+
+
+/*
+ * removexattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * removexattr entry point: records a stub of the call in frame->local (for
+ * a possible retry) and winds it to the first child.
+ */
+int
+aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/*
+ * fremovexattr callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * fremovexattr entry point: records a stub of the call in frame->local (for
+ * a possible retry) and winds it to the first child.
+ */
+int
+aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+
+/*
+ * lk callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata);
+ return 0;
+}
+
+
+/*
+ * lk entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, lk, aha_lk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lk,
+ fd, cmd, lock, xdata);
+ return 0;
+}
+
+
+/*
+ * inodelk callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * inodelk entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_inodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->inodelk,
+ volume, loc, cmd, lock, xdata);
+ return 0;
+}
+
+
+/*
+ * finodelk callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * finodelk entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_finodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->finodelk,
+ volume, fd, cmd, lock, xdata);
+ return 0;
+}
+
+
+/*
+ * entrylk callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * entrylk entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_entrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->entrylk,
+ volume, loc, basename, cmd, type, xdata);
+ return 0;
+}
+
+
+/*
+ * fentrylk callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+/*
+ * fentrylk entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_fentrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fentrylk,
+ volume, fd, basename, cmd, type, xdata);
+ return 0;
+}
+
+/*
+ * readdir callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+
+/*
+ * readdir entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t off, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readdir,
+ fd, size, off, xdata);
+ return 0;
+}
+
+
+/*
+ * readdirp callback: an ENOTCONN error seen before the server-wait timer
+ * expires parks the fop for retry; anything else is unwound to the caller.
+ */
+int
+aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+
+/*
+ * readdirp entry point: records a stub of the call in frame->local (for a
+ * possible retry) and winds it to the first child.
+ */
+int
+aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict)
+{
+ AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readdirp,
+ fd, size, off, dict);
+ return 0;
+}
diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h
new file mode 100644
index 00000000000..b1fb9d38a80
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-fops.h
@@ -0,0 +1,360 @@
+#ifndef _AHA_FOPS_H
+#define _AHA_FOPS_H
+
+#include "aha.h"
+#include "aha-helpers.h"
+
+/* FOP functions */
+int
+aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+int
+aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+int
+aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata);
+
+int
+aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata);
+
+int
+aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
+
+int
+aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
+
+int
+aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata);
+
+int
+aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
+
+int
+aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
+
+int
+aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
+
+int
+aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata);
+
+int
+aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int
+aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int
+aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+int
+aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+int
+aha_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags,
+ dict_t *xdata);
+
+int
+aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t off, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+
+int
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int
+aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata);
+
+int
+aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int
+aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
+
+int
+aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata);
+
+int
+aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int
+aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+
+int
+aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+
+int
+aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int
+aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int
+aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+
+int
+aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata);
+
+int
+aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *lock,
+ dict_t *xdata);
+
+int
+aha_finodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata);
+
+int
+aha_entrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+
+int
+aha_fentrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+int
+aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata);
+
+int
+aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict);
+
+/* Callback functions */
+
+int
+aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent);
+
+int
+aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata);
+
+int
+aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata);
+
+int
+aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata);
+
+int
+aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+
+int
+aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+
+
+int
+aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+
+int
+aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *path, struct iatt *sbuf, dict_t *xdata);
+
+
+int
+aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+
+int
+aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+int
+aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+
+int
+aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+int
+aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+int
+aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata);
+
+int
+aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+int
+aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+int
+aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata);
+int
+aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref, dict_t *xdata);
+
+int
+aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata);
+int
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_fsync_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+int
+aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata);
+
+int
+aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata);
+int
+aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata);
+int
+aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata);
+
+int
+aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
+int
+aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
+
+#endif /* _AHA_FOPS_H */
diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c
new file mode 100644
index 00000000000..e3b713688d3
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.c
@@ -0,0 +1,46 @@
+#include "aha-helpers.h"
+
+/*
+ * Allocate a zero-filled aha_conf and initialise its failed-fop list head
+ * and its lock.  Returns NULL when the allocation fails.
+ */
+struct aha_conf *aha_conf_new ()
+{
+        struct aha_conf *new_conf = GF_CALLOC (1, sizeof (*new_conf),
+                                               gf_aha_mt_conf);
+
+        if (new_conf != NULL) {
+                INIT_LIST_HEAD (&new_conf->failed);
+                LOCK_INIT (&new_conf->lock);
+        }
+
+        return new_conf;
+}
+
+/*
+ * Destroy the lock embedded in an aha_conf and release the structure.
+ * A NULL conf is ignored, mirroring aha_fop_destroy(), so the function can
+ * be used on error paths where the allocation itself failed.
+ */
+void aha_conf_destroy (struct aha_conf *conf)
+{
+        if (!conf)
+                return;
+
+        LOCK_DESTROY (&conf->lock);
+        GF_FREE (conf);
+}
+
+/*
+ * Allocate a zero-filled aha_fop whose list node is initialised (not yet
+ * linked into any queue).  Returns NULL when the allocation fails.
+ */
+struct aha_fop *aha_fop_new ()
+{
+        struct aha_fop *new_fop = GF_CALLOC (1, sizeof (*new_fop),
+                                             gf_aha_mt_fop);
+
+        if (new_fop != NULL)
+                INIT_LIST_HEAD (&new_fop->list);
+
+        return new_fop;
+}
+
+/*
+ * Release an aha_fop together with the call stub it holds.
+ * Passing NULL is a no-op.
+ */
+void aha_fop_destroy (struct aha_fop *fop)
+{
+        if (fop != NULL) {
+                call_stub_destroy (fop->stub);
+                fop->stub = NULL;
+                GF_FREE (fop);
+        }
+}
diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h
new file mode 100644
index 00000000000..d9cf9b3295d
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.h
@@ -0,0 +1,23 @@
+#ifndef _AHA_HELPERS_H
+#define _AHA_HELPERS_H
+
+#include "aha.h"
+
+#define GF_AHA "aha"
+
+struct aha_conf *aha_conf_new ();
+
+void aha_conf_destroy (struct aha_conf *conf);
+
+struct aha_fop *aha_fop_new ();
+
+void aha_fop_destroy (struct aha_fop *fop);
+
+/*
+ * AHA_DESTROY_LOCAL: free the struct aha_fop stored in frame->local and
+ * clear the pointer.  The argument is evaluated exactly once, and the
+ * temporary uses a name that cannot shadow a caller's `fop` or `frame`.
+ */
+#define AHA_DESTROY_LOCAL(frame) \
+        do { \
+                call_frame_t *aha_dl_frame_ = (frame); \
+                aha_fop_destroy (aha_dl_frame_->local); \
+                aha_dl_frame_->local = NULL; \
+        } while (0)
+
+#endif /* _AHA_HELPERS_H */
diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h
new file mode 100644
index 00000000000..117dda27e8b
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-mem-types.h
@@ -0,0 +1,22 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __AHA_MEM_TYPES_H__
+#define __AHA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_aha_mem_types_ { /* memory-accounting type ids passed to GF_CALLOC by the AHA xlator */
+ gf_aha_mt_begin_t = gf_common_mt_end + 1, /* starts right after the common ids */
+ gf_aha_mt_conf, /* used for struct aha_conf in aha_conf_new() */
+ gf_aha_mt_fop, /* used for struct aha_fop in aha_fop_new() */
+ gf_aha_mt_end /* sentinel; mem_acct_init() registers gf_aha_mt_end + 1 types */
+};
+#endif
diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c
new file mode 100644
index 00000000000..8810f913f42
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-retry.c
@@ -0,0 +1,524 @@
+#include "aha.h"
+#include "aha-helpers.h"
+#include "aha-retry.h"
+#include "aha-fops.h"
+
+/*
+ * AHA_RETRY_FOP:
+ *
+ * - We STACK_WIND the fop using the arguments saved in the call_stub.
+ * We use STACK_WIND because we need a *new* frame, since we already
+ * exhausted the existing frame with the original STACK_WIND.
+ *
+ * - After STACK_WIND completes, we can destroy this frame's local (which
+ * should be a struct aha_fop *). The frame itself will get destroyed higher
+ * in the xlator graph, since it is still part of the call stack.
+ */
+/*
+ * `args` is expanded inside the block below, so any `stub` it mentions
+ * resolves to the local declared here (the fop's own call stub).
+ *
+ * NOTE(review): if the re-wound fop can complete and unwind `frame` before
+ * STACK_WIND returns, the AHA_DESTROY_LOCAL that follows would touch a
+ * frame that is already gone -- confirm against the aha_*_cbk handlers.
+ */
+#define AHA_RETRY_FOP(fop, type, args ...) \
+        do { \
+                call_stub_t *stub = (fop)->stub; \
+                call_frame_t *frame = (fop)->frame; \
+                xlator_t *this = frame->this; \
+                STACK_WIND (frame, aha_##type##_cbk, this, \
+                            this->fops->type, args); \
+                AHA_DESTROY_LOCAL (frame); \
+        } while (0)
+
+/*
+ * AHA_UNWIND_FOP: give up on a queued fop.  Frees the frame's local (the
+ * struct aha_fop) and then fails the fop with ETIMEDOUT through the
+ * matching default_<type>_failure_cbk.
+ */
+#define AHA_UNWIND_FOP(fop, type) \
+        do { \
+                call_frame_t *frame = (fop)->frame; \
+                AHA_DESTROY_LOCAL (frame); \
+                default_##type##_failure_cbk (frame, ETIMEDOUT); \
+        } while (0)
+
+/*
+ * Force-unwind every fop queued on conf->failed and log how many were
+ * removed.  Called with conf->lock held (see aha_force_unwind_fops()).
+ */
+void
+__aha_retry_force_unwind_fops (struct aha_conf *conf)
+{
+        size_t count = 0;
+        struct aha_fop *cur = NULL;
+        struct aha_fop *next = NULL;
+
+        /* The _safe iterator is required: each entry is unlinked in-loop. */
+        list_for_each_entry_safe (cur, next, &conf->failed, list) {
+                list_del (&cur->list);
+                aha_force_unwind_fop (cur);
+                ++count;
+        }
+
+        gf_log (GF_AHA, GF_LOG_WARNING,
+                "Force-unwound %"GF_PRI_SIZET" fops!", count);
+
+        /* Everything was unlinked above, so the queue must be empty now. */
+        assert (list_empty (&conf->failed));
+}
+
+/* Locked entry point: drain and force-unwind the failed-fop queue. */
+void
+aha_force_unwind_fops (struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        __aha_retry_force_unwind_fops (conf);
+        UNLOCK (&conf->lock);
+}
+
+/*
+ * Re-issue every fop queued on conf->failed.  Called with conf->lock held
+ * (see aha_retry_failed_fops()).
+ *
+ * Nothing is retried while the child is down, and an empty queue returns
+ * early instead of falling through to the drain loop and its summary log.
+ */
+void
+__aha_retry_failed_fops (struct aha_conf *conf)
+{
+        struct aha_fop *fop = NULL;
+        struct aha_fop *tmp = NULL;
+        size_t ndrained = 0;
+
+        /*
+         * Skip if the child is not up
+         */
+        if (!conf->child_up) {
+                gf_log (GF_AHA, GF_LOG_WARNING,
+                        "Waiting for child to come up before retrying.");
+                return;
+        }
+
+        /*
+         * Skip if the queue is empty.
+         */
+        if (list_empty (&conf->failed)) {
+                gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry.");
+                return;
+        }
+
+        /*
+         * Drain the queue. After we finish the loop, the list
+         * must be empty.
+         */
+        list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+                list_del (&fop->list);
+                aha_retry_fop (fop);
+                ndrained++;
+        }
+
+        gf_log (GF_AHA, GF_LOG_WARNING,
+                "Drained %"GF_PRI_SIZET" fops!", ndrained);
+
+        assert (list_empty (&conf->failed));
+}
+
+
+/* Locked entry point: retry everything on the failed-fop queue. */
+void
+aha_retry_failed_fops (struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        __aha_retry_failed_fops (conf);
+        UNLOCK (&conf->lock);
+}
+
+void aha_retry_fop (struct aha_fop *fop) /* re-wind one saved fop using the arguments kept in its call stub */
+{
+ call_stub_t *stub = fop->stub; /* holds the fop type and the saved arguments */
+
+ switch (stub->fop) { /* one case per supported fop; each expands AHA_RETRY_FOP */
+ case GF_FOP_OPEN:
+ AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags,
+ stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_CREATE:
+ AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags,
+ stub->args.mode, stub->args.umask,
+ stub->args.fd,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_STAT:
+ AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_READLINK:
+ AHA_RETRY_FOP (fop, readlink, &stub->args.loc,
+ stub->args.size, stub->args.xdata);
+ break;
+
+ case GF_FOP_MKNOD:
+ AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode,
+ stub->args.rdev, stub->args.umask,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_MKDIR:
+ AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode,
+ stub->args.umask, stub->args.xdata);
+ break;
+
+ case GF_FOP_UNLINK:
+ AHA_RETRY_FOP (fop, unlink, &stub->args.loc, stub->args.xflag,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_RMDIR:
+ AHA_RETRY_FOP (fop, rmdir, &stub->args.loc,
+ stub->args.flags, stub->args.xdata);
+ break;
+
+ case GF_FOP_SYMLINK:
+ AHA_RETRY_FOP (fop, symlink, stub->args.linkname,
+ &stub->args.loc, stub->args.umask,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_RENAME:
+ AHA_RETRY_FOP (fop, rename, &stub->args.loc,
+ &stub->args.loc2, stub->args.xdata);
+ break;
+
+ case GF_FOP_LINK:
+ AHA_RETRY_FOP (fop, link, &stub->args.loc,
+ &stub->args.loc2, stub->args.xdata);
+ break;
+
+ case GF_FOP_TRUNCATE:
+ AHA_RETRY_FOP (fop, truncate, &stub->args.loc,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_READ: /* GF_FOP_READ maps to the readv fop */
+ AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.flags,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_WRITE: /* GF_FOP_WRITE maps to the writev fop */
+ AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector,
+ stub->args.count, stub->args.offset,
+ stub->args.flags, stub->args.iobref,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_STATFS:
+ AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_FLUSH:
+ AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSYNC:
+ AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_SETXATTR:
+ AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr,
+ stub->args.flags, stub->args.xdata);
+ break;
+
+ case GF_FOP_GETXATTR:
+ AHA_RETRY_FOP (fop, getxattr, &stub->args.loc,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd,
+ stub->args.xattr, stub->args.flags,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_REMOVEXATTR:
+ AHA_RETRY_FOP (fop, removexattr, &stub->args.loc,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_OPENDIR:
+ AHA_RETRY_FOP (fop, opendir, &stub->args.loc,
+ stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSYNCDIR:
+ AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd,
+ stub->args.datasync, stub->args.xdata);
+ break;
+
+ case GF_FOP_ACCESS:
+ AHA_RETRY_FOP (fop, access, &stub->args.loc,
+ stub->args.mask, stub->args.xdata);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ AHA_RETRY_FOP (fop, ftruncate, stub->args.fd,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSTAT:
+ AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_LK:
+ AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_INODELK:
+ AHA_RETRY_FOP (fop, inodelk, stub->args.volume,
+ &stub->args.loc, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_FINODELK:
+ AHA_RETRY_FOP (fop, finodelk, stub->args.volume,
+ stub->args.fd, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_ENTRYLK:
+ AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc,
+ stub->args.name, stub->args.entrylkcmd,
+ stub->args.entrylktype, stub->args.xdata);
+ break;
+
+ case GF_FOP_FENTRYLK:
+ AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd,
+ stub->args.name, stub->args.entrylkcmd,
+ stub->args.entrylktype, stub->args.xdata);
+ break;
+
+ case GF_FOP_LOOKUP:
+ AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_READDIR:
+ AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_READDIRP:
+ AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_XATTROP:
+ AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype,
+ stub->args.xattr, stub->args.xdata);
+ break;
+
+ case GF_FOP_FXATTROP:
+ AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype,
+ stub->args.xattr, stub->args.xdata);
+ break;
+
+ case GF_FOP_SETATTR:
+ AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat,
+ stub->args.valid, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSETATTR:
+ AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat,
+ stub->args.valid, stub->args.xdata);
+ break;
+
+ default:
+ /* Some fops are not implemented yet:
+ *
+ * GF_FOP_NULL
+ * GF_FOP_RCHECKSUM
+ * GF_FOP_FORGET
+ * GF_FOP_RELEASE
+ * GF_FOP_RELEASEDIR
+ * GF_FOP_GETSPEC
+ * GF_FOP_FALLOCATE
+ * GF_FOP_DISCARD
+ * GF_FOP_ZEROFILL
+ * GF_FOP_MAXVALUE
+ *
+ */
+ gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s",
+ gf_fop_list[stub->fop]);
+ assert (0); /* unsupported fop: log, then abort (when asserts are enabled) */
+ break;
+ }
+}
+
+void
+aha_force_unwind_fop (struct aha_fop *fop) /* fail one saved fop with ETIMEDOUT via AHA_UNWIND_FOP */
+{
+ call_stub_t *stub = fop->stub; /* only the fop type is read from the stub here */
+
+ switch (stub->fop) { /* selects the default_<type>_failure_cbk to invoke */
+ case GF_FOP_OPEN:
+ AHA_UNWIND_FOP (fop, open);
+ break;
+
+ case GF_FOP_CREATE:
+ AHA_UNWIND_FOP (fop, create);
+ break;
+
+ case GF_FOP_STAT:
+ AHA_UNWIND_FOP (fop, stat);
+ break;
+
+ case GF_FOP_READLINK:
+ AHA_UNWIND_FOP (fop, readlink);
+ break;
+
+ case GF_FOP_MKNOD:
+ AHA_UNWIND_FOP (fop, mknod);
+ break;
+
+ case GF_FOP_MKDIR:
+ AHA_UNWIND_FOP (fop, mkdir);
+ break;
+
+ case GF_FOP_UNLINK:
+ AHA_UNWIND_FOP (fop, unlink);
+ break;
+
+ case GF_FOP_RMDIR:
+ AHA_UNWIND_FOP (fop, rmdir);
+ break;
+
+ case GF_FOP_SYMLINK:
+ AHA_UNWIND_FOP (fop, symlink);
+ break;
+
+ case GF_FOP_RENAME:
+ AHA_UNWIND_FOP (fop, rename);
+ break;
+
+ case GF_FOP_LINK:
+ AHA_UNWIND_FOP (fop, link);
+ break;
+
+ case GF_FOP_TRUNCATE:
+ AHA_UNWIND_FOP (fop, truncate);
+ break;
+
+ case GF_FOP_READ: /* GF_FOP_READ maps to the readv fop */
+ AHA_UNWIND_FOP (fop, readv);
+ break;
+
+ case GF_FOP_WRITE: /* GF_FOP_WRITE maps to the writev fop */
+ AHA_UNWIND_FOP (fop, writev);
+ break;
+
+ case GF_FOP_STATFS:
+ AHA_UNWIND_FOP (fop, statfs);
+ break;
+
+ case GF_FOP_FLUSH:
+ AHA_UNWIND_FOP (fop, flush);
+ break;
+
+ case GF_FOP_FSYNC:
+ AHA_UNWIND_FOP (fop, fsync);
+ break;
+
+ case GF_FOP_SETXATTR:
+ AHA_UNWIND_FOP (fop, setxattr);
+ break;
+
+ case GF_FOP_GETXATTR:
+ AHA_UNWIND_FOP (fop, getxattr);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ AHA_UNWIND_FOP (fop, fsetxattr);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ AHA_UNWIND_FOP (fop, fgetxattr);
+ break;
+
+ case GF_FOP_REMOVEXATTR:
+ AHA_UNWIND_FOP (fop, removexattr);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ AHA_UNWIND_FOP (fop, fremovexattr);
+ break;
+
+ case GF_FOP_OPENDIR:
+ AHA_UNWIND_FOP (fop, opendir);
+ break;
+
+ case GF_FOP_FSYNCDIR:
+ AHA_UNWIND_FOP (fop, fsyncdir);
+ break;
+
+ case GF_FOP_ACCESS:
+ AHA_UNWIND_FOP (fop, access);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ AHA_UNWIND_FOP (fop, ftruncate);
+ break;
+
+ case GF_FOP_FSTAT:
+ AHA_UNWIND_FOP (fop, fstat);
+ break;
+
+ case GF_FOP_LK:
+ AHA_UNWIND_FOP (fop, lk);
+ break;
+
+ case GF_FOP_INODELK:
+ AHA_UNWIND_FOP (fop, inodelk);
+ break;
+
+ case GF_FOP_FINODELK:
+ AHA_UNWIND_FOP (fop, finodelk);
+ break;
+
+ case GF_FOP_ENTRYLK:
+ AHA_UNWIND_FOP (fop, entrylk);
+ break;
+
+ case GF_FOP_FENTRYLK:
+ AHA_UNWIND_FOP (fop, fentrylk);
+ break;
+
+ case GF_FOP_LOOKUP:
+ AHA_UNWIND_FOP (fop, lookup);
+ break;
+
+ case GF_FOP_READDIR:
+ AHA_UNWIND_FOP (fop, readdir);
+ break;
+
+ case GF_FOP_READDIRP:
+ AHA_UNWIND_FOP (fop, readdirp);
+ break;
+
+ case GF_FOP_XATTROP:
+ AHA_UNWIND_FOP (fop, xattrop);
+ break;
+
+ case GF_FOP_FXATTROP:
+ AHA_UNWIND_FOP (fop, fxattrop);
+ break;
+
+ case GF_FOP_SETATTR:
+ AHA_UNWIND_FOP (fop, setattr);
+ break;
+
+ case GF_FOP_FSETATTR:
+ AHA_UNWIND_FOP (fop, fsetattr);
+ break;
+
+ default:
+ /* Some fops are not implemented yet; this branch should
+ * never be reached because such fops are not expected to be
+ * queued (see the assert statement in aha_retry_fop()).
+ * NOTE(review): if reached, the fop is neither unwound nor freed. */
+ break;
+ }
+}
diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h
new file mode 100644
index 00000000000..5c8f56bca97
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-retry.h
@@ -0,0 +1,12 @@
+#ifndef _AHA_RETRY_H
+#define _AHA_RETRY_H
+
+void aha_retry_failed_fops (struct aha_conf *conf);
+
+void aha_retry_fop (struct aha_fop *fop);
+
+void aha_force_unwind_fops (struct aha_conf *conf);
+
+void aha_force_unwind_fop (struct aha_fop *fop);
+
+#endif /* _AHA_RETRY_H */
diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c
new file mode 100644
index 00000000000..2135e47f37f
--- /dev/null
+++ b/xlators/cluster/aha/src/aha.c
@@ -0,0 +1,345 @@
+#include "aha-helpers.h"
+#include "aha-retry.h"
+#include "aha-fops.h"
+#include "aha.h"
+
+#include "syncop.h"
+
+
+/*
+ * Completion callback handed to synctask_new() for the queue-drain task.
+ * There is nothing to clean up, so it only reports success.
+ */
+int
+retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg)
+{
+        return 0;
+}
+
+/*
+ * Synctask body: `arg` is the AHA xlator; retry every fop queued on its
+ * private aha_conf.  Always returns 0.
+ */
+int
+retry_failed_fops (void *arg)
+{
+        xlator_t *xl = arg;
+        struct aha_conf *priv = xl->private;
+
+        aha_retry_failed_fops (priv);
+
+        return 0;
+}
+
+/*
+ * Schedule retry_failed_fops() as a synctask on the context's syncenv.
+ * A scheduling failure is logged as critical; nothing is returned.
+ */
+void
+dispatch_fop_queue_drain (xlator_t *this)
+{
+        int rc = synctask_new (this->ctx->env, retry_failed_fops,
+                               retry_failed_fops_cbk, NULL, this);
+
+        if (rc == 0)
+                return;
+
+        gf_log (GF_AHA, GF_LOG_CRITICAL,
+                "Failed to dispatch synctask "
+                "to drain fop queue!");
+}
+
+/*
+ * Record whether the child-down timer has expired.  The callers in this
+ * file hold conf->lock around the call.
+ *
+ * Declared static: a plain `inline` definition with external linkage does
+ * not emit an external definition under C99 inline rules, so any call the
+ * compiler chooses not to inline would fail at link time.
+ */
+static inline void
+__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired)
+{
+        conf->timer_expired = expired;
+}
+
+/*
+ * Return the stored timer-expired flag without taking conf->lock; use
+ * aha_is_timer_expired() for the locked variant.
+ *
+ * Declared static: a plain `inline` definition with external linkage does
+ * not emit an external definition under C99 inline rules, so any call the
+ * compiler chooses not to inline would fail at link time.
+ */
+static inline gf_boolean_t
+__aha_is_timer_expired (struct aha_conf *conf)
+{
+        return conf->timer_expired;
+}
+
+/* Locked read of the timer-expired flag. */
+gf_boolean_t
+aha_is_timer_expired (struct aha_conf *conf)
+{
+        gf_boolean_t result = _gf_false;
+
+        LOCK (&conf->lock);
+        result = __aha_is_timer_expired (conf);
+        UNLOCK (&conf->lock);
+
+        return result;
+}
+
+/*
+ * Timer callback registered by __aha_start_timer(); `data` is the
+ * aha_conf.  Marks the timer as expired under the lock, then force-unwinds
+ * every queued fop.
+ *
+ * NOTE(review): conf->timer still refers to the timer that just fired --
+ * confirm whether the timer framework frees fired events, in which case a
+ * later __aha_cancel_timer() would act on a stale handle.
+ */
+void
+aha_child_down_timer_expired (void *data)
+{
+        struct aha_conf *priv = data;
+
+        gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!");
+
+        LOCK (&priv->lock);
+        __aha_set_timer_status (priv, _gf_true);
+        UNLOCK (&priv->lock);
+
+        /* Takes conf->lock itself, so it must run after the unlock above. */
+        aha_force_unwind_fops (priv);
+}
+
+/*
+ * Arm the child-down timer for conf->server_wait_timeout seconds and clear
+ * the expired flag.  Called with conf->lock held.
+ *
+ * When registration fails, only the failure is logged; the success message
+ * is no longer printed after it.
+ */
+void
+__aha_start_timer (struct aha_conf *conf)
+{
+        struct timespec child_down_timeout = {
+                .tv_sec = conf->server_wait_timeout,
+                .tv_nsec = 0
+        };
+
+        __aha_set_timer_status (conf, _gf_false);
+
+        conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout,
+                                           aha_child_down_timer_expired, conf);
+        if (!conf->timer) {
+                gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!");
+                return;
+        }
+
+        /* server_wait_timeout is uint64_t; cast so it matches %lu. */
+        gf_log (GF_AHA, GF_LOG_INFO,
+                "Registered timer for %lu seconds.",
+                (unsigned long) conf->server_wait_timeout);
+}
+
+/*
+ * Cancel the child-down timer if one is registered and forget its handle.
+ * Does nothing when conf->timer is NULL.
+ */
+void
+__aha_cancel_timer (struct aha_conf *conf)
+{
+        if (conf->timer != NULL) {
+                gf_timer_call_cancel (conf->this->ctx, conf->timer);
+                conf->timer = NULL;
+                gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!");
+        }
+}
+
+void
+__aha_update_child_status (struct aha_conf *conf, int status) /* callers pass AHA_CHILD_STATUS_UP or AHA_CHILD_STATUS_DOWN while holding conf->lock */
+{
+ conf->child_up = status; /* stored in the uint8_t child_up field */
+}
+
+/*
+ * CHILD_UP handling: under conf->lock, mark the child as up, clear the
+ * timer-expired flag and cancel any pending child-down timer.
+ */
+void
+aha_handle_child_up (xlator_t *this)
+{
+        struct aha_conf *priv = this->private;
+
+        LOCK (&priv->lock);
+        __aha_update_child_status (priv, AHA_CHILD_STATUS_UP);
+        __aha_set_timer_status (priv, _gf_false);
+        __aha_cancel_timer (priv);
+        UNLOCK (&priv->lock);
+}
+
+/*
+ * CHILD_DOWN handling: under conf->lock, mark the child as down and start
+ * the child-down timer.
+ */
+void
+aha_handle_child_down (xlator_t *this)
+{
+        struct aha_conf *priv = this->private;
+
+        LOCK (&priv->lock);
+        __aha_update_child_status (priv, AHA_CHILD_STATUS_DOWN);
+        /* __aha_start_timer() resets this flag to false straight away. */
+        __aha_set_timer_status (priv, _gf_true);
+        __aha_start_timer (priv);
+        UNLOCK (&priv->lock);
+}
+
+/*
+ * Xlator notification hook.  CHILD_DOWN starts the wait timer; CHILD_UP
+ * cancels it and schedules a drain of the failed-fop queue.  Every event
+ * is then passed to default_notify().  Always returns 0.
+ */
+int32_t
+notify (xlator_t *this, int32_t event, void *data, ...)
+{
+        if (event == GF_EVENT_CHILD_DOWN) {
+                gf_log (this->name, GF_LOG_WARNING, "Got child-down event!");
+                aha_handle_child_down (this);
+        } else if (event == GF_EVENT_CHILD_UP) {
+                gf_log (this->name, GF_LOG_WARNING, "Got child-up event!");
+                aha_handle_child_up (this);
+                dispatch_fop_queue_drain (this);
+        }
+
+        default_notify (this, event, data);
+
+        return 0;
+}
+
+int32_t
+aha_priv_dump (xlator_t *this) /* installed as dumpops.priv; currently dumps nothing */
+{
+ return 0; /* always reports success */
+}
+
+/*
+ * Register the AHA memory-accounting types with the xlator.
+ * Returns -1 when `this` is NULL, otherwise the result of
+ * xlator_mem_acct_init() (a failure is logged).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int rc = -1;
+
+        if (this == NULL)
+                return rc;
+
+        rc = xlator_mem_acct_init (this, gf_aha_mt_end + 1);
+        if (rc != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed!");
+
+        return rc;
+}
+
+/*
+ * Re-read "server-wait-timeout-seconds" from the new option dictionary
+ * into the private conf.  Returns 0 on success and -1 when the option
+ * macro jumps to its error label.
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        struct aha_conf *conf = this->private;
+        int ret = -1;
+
+        GF_OPTION_RECONF ("server-wait-timeout-seconds",
+                          conf->server_wait_timeout,
+                          options, size_uint64, out);
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Load "server-wait-timeout-seconds" into the private conf at init time.
+ * Returns 0 on success and -1 when the option macro jumps to its error
+ * label.
+ */
+int
+aha_init_options (xlator_t *this)
+{
+        struct aha_conf *conf = this->private;
+        int ret = -1;
+
+        GF_OPTION_INIT ("server-wait-timeout-seconds",
+                        conf->server_wait_timeout,
+                        size_uint64, out);
+
+        ret = 0;
+out:
+        return ret;
+}
+
+
+/*
+ * Xlator init: allocate the private aha_conf, attach it to `this` and load
+ * the options.
+ *
+ * Returns 0 on success, -(ENOMEM) when the conf cannot be allocated and -1
+ * when option parsing fails.  On failure this->private is left NULL and
+ * any conf that was allocated is released; a NULL conf is never handed to
+ * aha_conf_destroy().
+ */
+int
+init (xlator_t *this)
+{
+        int ret = 0;
+        struct aha_conf *conf = NULL;
+
+        conf = aha_conf_new ();
+        if (!conf) {
+                ret = -(ENOMEM);
+                goto err;
+        }
+
+        conf->this = this;
+        this->private = conf;
+
+        if (aha_init_options (this) != 0) {
+                ret = -1;
+                goto err;
+        }
+
+        /* init() completed successfully */
+        return 0;
+err:
+        gf_log (GF_AHA, GF_LOG_ERROR,
+                "init() failed, please see "
+                "logs for details.");
+
+        /* Free all allocated memory (nothing to free if allocation failed) */
+        if (conf) {
+                this->private = NULL;
+                aha_conf_destroy (conf);
+        }
+
+        return ret;
+}
+
+/*
+ * Xlator fini: release the private aha_conf, if any, and clear
+ * this->private.  Safe to call when init() never attached a conf.
+ */
+void
+fini (xlator_t *this)
+{
+        struct aha_conf *conf = this->private;
+
+        if (!conf)
+                return;
+
+        this->private = NULL;
+        aha_conf_destroy (conf);
+}
+
+struct xlator_dumpops dumpops = { /* statedump hooks exported by this xlator */
+ .priv = aha_priv_dump, /* private-state dump; currently a no-op returning 0 */
+};
+
+/* No callbacks are implemented; the table is left zero-initialised. */
+struct xlator_cbks cbks;
+
+struct xlator_fops fops = { /* fop dispatch table: each entry points at the aha_* handler of the same name */
+ .lookup = aha_lookup,
+ .stat = aha_stat,
+ .readlink = aha_readlink,
+ .mknod = aha_mknod,
+ .mkdir = aha_mkdir,
+ .unlink = aha_unlink,
+ .rmdir = aha_rmdir,
+ .symlink = aha_symlink,
+ .rename = aha_rename,
+ .link = aha_link,
+ .truncate = aha_truncate,
+ .create = aha_create,
+ .open = aha_open,
+ .readv = aha_readv,
+ .writev = aha_writev,
+ .statfs = aha_statfs,
+ .flush = aha_flush,
+ .fsync = aha_fsync,
+ .setxattr = aha_setxattr,
+ .getxattr = aha_getxattr,
+ .removexattr = aha_removexattr,
+ .fsetxattr = aha_fsetxattr,
+ .fgetxattr = aha_fgetxattr,
+ .fremovexattr = aha_fremovexattr,
+ .opendir = aha_opendir,
+ .readdir = aha_readdir,
+ .readdirp = aha_readdirp,
+ .fsyncdir = aha_fsyncdir,
+ .access = aha_access,
+ .ftruncate = aha_ftruncate,
+ .fstat = aha_fstat,
+ .lk = aha_lk,
+ .lookup_cbk = aha_lookup_cbk, /* NOTE(review): the only *_cbk slot filled here -- confirm this is intentional */
+ .xattrop = aha_xattrop,
+ .fxattrop = aha_fxattrop,
+ .inodelk = aha_inodelk,
+ .finodelk = aha_finodelk,
+ .entrylk = aha_entrylk,
+ .fentrylk = aha_fentrylk,
+ .setattr = aha_setattr,
+ .fsetattr = aha_fsetattr,
+};
+
+struct volume_options options[] = { /* option table for this xlator, terminated by a NULL key */
+ { .key = {"server-wait-timeout-seconds"}, /* read into conf->server_wait_timeout by aha_init_options()/reconfigure() */
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 10, /* seconds */
+ .max = 20 * 60, /* seconds (20 minutes) */
+ .default_value = TOSTRING (120), /* seconds, given as a string */
+ .description = "Specifies the number of seconds the "
+ "AHA translator will wait "
+ "for a CHILD_UP event before "
+ "force-unwinding the frames it has "
+ "currently stored for retry."
+ },
+ { .key = {NULL} }, /* terminator entry */
+};
diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h
new file mode 100644
index 00000000000..3dbf3199776
--- /dev/null
+++ b/xlators/cluster/aha/src/aha.h
@@ -0,0 +1,46 @@
+#ifndef _AHA_H
+#define _AHA_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "statedump.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "list.h"
+#include "timer.h"
+
+#include "aha-mem-types.h"
+
+/* new() and destroy() functions for all structs can be found in
+ * aha-helpers.c
+ */
+struct aha_conf { /* per-xlator state; init() stores it in xlator_t->private */
+ xlator_t *this; /* back-pointer to the owning xlator, set in init() */
+ uint8_t child_up; /* AHA_CHILD_STATUS_UP or AHA_CHILD_STATUS_DOWN */
+ gf_lock_t lock; /* held around child_up, timer and failed-queue updates in aha.c / aha-retry.c */
+ struct list_head failed; /* queue of struct aha_fop waiting to be retried or force-unwound */
+ gf_timer_t *timer; /* child-down timer handle; reset to NULL by __aha_cancel_timer() */
+ gf_boolean_t timer_expired; /* set to true by the timer callback, cleared on CHILD_UP */
+ uint64_t server_wait_timeout; /* seconds, from option "server-wait-timeout-seconds" */
+};
+
+struct aha_fop { /* one saved fop; kept in frame->local and, when queued, linked on aha_conf.failed */
+ call_stub_t *stub; /* Only used to store function arguments */
+ call_frame_t *frame; /* Frame corresponding to this fop */
+ uint64_t tries; /* presumably a retry counter; not referenced in the helper/retry code visible here -- TODO confirm */
+ struct list_head list; /* link into aha_conf.failed */
+};
+
+enum { /* values stored in aha_conf.child_up */
+ AHA_CHILD_STATUS_DOWN = 0, /* set on GF_EVENT_CHILD_DOWN */
+ AHA_CHILD_STATUS_UP = 1, /* set on GF_EVENT_CHILD_UP */
+ AHA_CHILD_STATUS_MAX /* number of states; not a valid status itself */
+};
+
+gf_boolean_t aha_is_timer_expired (struct aha_conf *conf);
+
+#endif
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index a9714b02b79..a97d03bb055 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -5559,6 +5559,7 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
{
dht_local_t *local = NULL;
xlator_t *avail_subvol = NULL;
+ int op_errno = 0;
local = frame->local;
@@ -5571,9 +5572,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
subvol, subvol->fops->mknod, loc, mode,
rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
-
- if (avail_subvol != subvol) {
+ /* This will return NULL if all subvolumes are full
+ * and/or no subvolume meets the min_free_disk limit
+ */
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->rdev = rdev;
local->mode = mode;
@@ -5603,6 +5610,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
}
out:
return 0;
+err:
+ return op_errno;
}
int32_t
@@ -6242,8 +6251,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
}
}
- dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode,
- umask, params);
+ op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc,
+ rdev, mode, umask,
+ params);
+ if (op_errno != 0) {
+ goto err;
+ }
done:
return 0;
@@ -6738,6 +6751,7 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
{
dht_local_t *local = NULL;
xlator_t *avail_subvol = NULL;
+ int op_errno = 0;
local = frame->local;
@@ -6752,8 +6766,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
} else {
avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
-
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
local->mode = mode;
@@ -6780,6 +6796,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
}
out:
return 0;
+err:
+ DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return op_errno;
}
int
@@ -6882,9 +6902,10 @@ dht_create_do (call_frame_t *frame)
goto err;
}
- dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc,
- local->flags, local->mode,
- local->umask, local->fd, local->params);
+ dht_create_wind_to_avail_subvol (frame, this, subvol,
+ &local->loc, local->flags,
+ local->mode, local->umask,
+ local->fd, local->params);
return 0;
err:
local->refresh_layout_unlock (frame, this, -1, 1);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 9e9ca712417..613a9d39816 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -300,6 +300,7 @@ struct dht_du {
uint64_t avail_space;
uint32_t log;
uint32_t chunks;
+ gf_boolean_t is_full;
};
typedef struct dht_du dht_du_t;
@@ -484,6 +485,7 @@ struct dht_conf {
dht_du_t *du_stats;
double min_free_disk;
double min_free_inodes;
+ gf_boolean_t min_free_strict_mode;
char disk_unit;
int32_t refresh_interval;
gf_boolean_t unhashed_sticky_bit;
@@ -549,6 +551,10 @@ struct dht_conf {
gf_boolean_t lock_migration_enabled;
gf_lock_t lock;
+
+ /* du stats */
+ uint32_t du_refresh_interval_sec;
+ gf_lock_t du_refresh_lock;
};
typedef struct dht_conf dht_conf_t;
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 1eb9e63c531..1b20dabc61f 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
call_frame_t *statfs_frame = NULL;
dht_local_t *statfs_local = NULL;
struct timeval tv = {0,};
+ struct timeval cmp_tv = {0,};
loc_t tmp_loc = {0,};
conf = this->private;
+ /* Somebody else is already refreshing the statfs info */
+ if (TRY_LOCK (&conf->du_refresh_lock) != 0)
+ return 0;
+
gettimeofday (&tv, NULL);
+ cmp_tv = conf->last_stat_fetch;
+ cmp_tv.tv_sec += conf->du_refresh_interval_sec;
+
/* make it root gfid, should be enough to get the proper
info back */
tmp_loc.gfid[15] = 1;
- if (tv.tv_sec > (conf->refresh_interval
- + conf->last_stat_fetch.tv_sec)) {
-
+ if (timercmp (&tv, &cmp_tv, >)) {
statfs_frame = copy_frame (frame);
if (!statfs_frame) {
goto err;
@@ -200,14 +206,18 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
&tmp_loc, statfs_local->params);
}
- conf->last_stat_fetch.tv_sec = tv.tv_sec;
+ conf->last_stat_fetch = tv;
}
- return 0;
+ ret = 0;
+ goto out;
err:
if (statfs_frame)
DHT_STACK_DESTROY (statfs_frame);
- return -1;
+ ret = -1;
+out:
+ UNLOCK (&conf->du_refresh_lock);
+ return ret;
}
@@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
conf = this->private;
/* Check for values above specified percent or free disk */
- LOCK (&conf->subvolume_lock);
- {
+ if (TRY_LOCK (&conf->subvolume_lock) != 0) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ return conf->du_stats[i].is_full;
+ }
+ }
+ } else {
for (i = 0; i < conf->subvolume_cnt; i++) {
if (subvol == conf->subvolumes[i]) {
if (conf->disk_unit == 'p') {
@@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
}
}
}
- }
+
+ /* i will be less than subvolume_cnt if either of
+ * these booleans are true */
+ is_subvol_filled = (
+ subvol_filled_space || subvol_filled_inodes);
+ if (is_subvol_filled) {
+ conf->du_stats[i].is_full = is_subvol_filled;
+ }
+ }
UNLOCK (&conf->subvolume_lock);
if (subvol_filled_space && conf->subvolume_status[i]) {
@@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
}
}
- is_subvol_filled = (subvol_filled_space || subvol_filled_inodes);
-
return is_subvol_filled;
}
@@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol,
layout);
- if(!avail_subvol)
- {
- avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol,
- layout);
- }
-
}
UNLOCK (&conf->subvolume_lock);
out:
@@ -325,7 +339,6 @@ out:
gf_msg_debug (this->name, 0,
"No subvolume has enough free space \
and/or inodes to create");
- avail_subvol = subvol;
}
if (layout)
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index 549f1b9ea7e..e320109c796 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
int op_errno = -1;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ conf = this->private;
+
+ if (conf->min_free_strict_mode == _gf_true)
+ dht_get_du_info (frame, this, loc);
local = dht_local_init (frame, loc, fd, GF_FOP_OPEN);
if (!local) {
@@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this,
"no cached subvolume for fd=%p", fd);
op_errno = EINVAL;
goto err;
+ } else if (conf->min_free_strict_mode == _gf_true &&
+ dht_is_subvol_filled (this, subvol) == _gf_true &&
+ flags & O_APPEND) {
+ op_errno = ENOSPC;
+ goto err;
}
local->rebalance.flags = flags;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index 112685b659e..7420461da76 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
xlator_t *subvol = NULL;
int op_errno = -1;
dht_local_t *local = NULL;
+ loc_t *nil_loc = {0,};
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ conf = this->private;
+
+
local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE);
if (!local) {
@@ -173,15 +178,21 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto err;
}
+ if (conf->min_free_strict_mode == _gf_true)
+ dht_get_du_info (frame, this, nil_loc);
+
subvol = local->cached_subvol;
if (!subvol) {
gf_msg_debug (this->name, 0,
"no cached subvolume for fd=%p", fd);
op_errno = EINVAL;
goto err;
+ } else if (conf->min_free_strict_mode == _gf_true &&
+ dht_is_subvol_filled (this, subvol) == _gf_true) {
+ op_errno = ENOSPC;
+ goto err;
}
-
local->rebalance.vector = iov_dup (vector, count);
local->rebalance.offset = off;
local->rebalance.count = count;
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 1d145855ed7..10fd878041e 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -20,7 +20,7 @@
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
-#define DHT_REBALANCE_BLKSIZE (128 * 1024)
+#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
#define MAX_MIGRATE_QUEUE_COUNT 500
#define MIN_MIGRATE_QUEUE_COUNT 200
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 5c810f0dc77..ccbf66b626d 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -443,6 +443,8 @@ dht_reconfigure (xlator_t *this, dict_t *options)
conf->disk_unit = 0;
if (conf->min_free_disk < 100.0)
conf->disk_unit = 'p';
+ GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode,
+ options, bool, out);
GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
percent, out);
@@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options,
bool, out);
+
+ GF_OPTION_RECONF ("du-refresh-interval-sec",
+ conf->du_refresh_interval_sec, options, uint32, out);
ret = 0;
out:
return ret;
@@ -720,7 +725,10 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
- err);
+ err);
+
+ GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode,
+ bool, err);
GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
err);
@@ -738,6 +746,11 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled,
bool, err);
+ GF_OPTION_INIT ("du-refresh-interval-sec",
+ conf->du_refresh_interval_sec, uint32, err);
+
+ LOCK_INIT (&conf->du_refresh_lock);
+
if (defrag) {
defrag->lock_migration_enabled = conf->lock_migration_enabled;
@@ -907,6 +920,14 @@ struct volume_options options[] = {
"process starts balancing out the cluster, and logs will appear "
"in log files",
},
+ { .key = {"min-free-strict-mode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "When enabled, will reject in-flight writes or "
+ "append operations to files when the target subvolume falls "
+ "below min-free-(disk|inodes). When disabled, these are allowed "
+ "through and only new files will be affected.",
+ },
{ .key = {"min-free-inodes"},
.type = GF_OPTION_TYPE_PERCENT,
.default_value = "5%",
@@ -1089,5 +1110,14 @@ struct volume_options options[] = {
" associated with a file during rebalance"
},
+ { .key = {"du-refresh-interval-sec"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "60",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Specifies how many seconds before subvolume statfs "
+ "info is re-validated."
+ },
+
{ .key = {NULL} },
};
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 56e17d6e884..996faffa37f 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this,
local);
}
- if (subvol != avail_subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (subvol != avail_subvol) {
/* create a link file instead of actual file */
local->params = dict_ref (params);
local->mode = mode;
@@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
local);
}
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
/* Create linkfile first */
local->params = dict_ref (params);
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index f1e9a399442..8b14ac99b8f 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this,
local);
}
- if (subvol != avail_subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (subvol != avail_subvol) {
/* create a link file instead of actual file */
local->mode = mode;
local->flags = flags;
@@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local);
}
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
/* Create linkfile first */
local->params = dict_ref (params);
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index c21417a0192..69f182c5194 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -91,9 +91,13 @@ typedef struct _ios_sample_t {
uid_t uid;
gid_t gid;
char identifier[UNIX_PATH_MAX];
+ char path[UNIX_PATH_MAX];
glusterfs_fop_t fop_type;
struct timeval timestamp;
double elapsed;
+ gf_boolean_t have_path;
+ int32_t op_ret;
+ int32_t op_errno;
} ios_sample_t;
@@ -178,10 +182,33 @@ typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*,
int , int , uint64_t ) ;
struct ios_local {
- struct timeval wind_at;
- struct timeval unwind_at;
+ inode_t *inode;
+ loc_t loc;
+ fd_t *fd;
};
+/* Allocate a zero-filled struct ios_local with GF_CALLOC.
+ *
+ * Returns the new object, or NULL when the allocation fails.
+ * Declared with (void) so the definition is a real prototype.
+ */
+static struct ios_local *
+ios_local_new (void)
+{
+        return GF_CALLOC (1, sizeof (struct ios_local),
+                          gf_common_mt_char);
+}
+
+/* Tear down an ios_local: unref the stored inode, unref the fd when one
+ * is set, wipe the embedded loc, then zero and free the struct.
+ * A NULL local is ignored.
+ */
+static void
+ios_local_free (struct ios_local *local)
+{
+        if (local != NULL) {
+                inode_unref (local->inode);
+
+                if (local->fd != NULL)
+                        fd_unref (local->fd);
+
+                loc_wipe (&local->loc);
+
+                /* Clear the contents before returning the memory. */
+                memset (local, 0, sizeof (*local));
+                GF_FREE (local);
+        }
+}
+
struct volume_options options[];
static int
@@ -192,6 +219,57 @@ is_fop_latency_started (call_frame_t *frame)
return memcmp (&frame->begin, &epoch, sizeof (epoch));
}
+/* Free the ios_local hanging off the frame (if any) and reset
+ * frame->local to NULL so it cannot be reused afterwards.
+ */
+static void
+ios_free_local (call_frame_t *frame)
+{
+        ios_local_free (frame->local);
+        frame->local = NULL;
+}
+
+/* Copy the fop's loc into the ios_local attached to the frame.
+ *
+ * Nothing is tracked when loc is NULL or carries no path. When the
+ * ios_local cannot be allocated the frame is left untouched, so
+ * frame->local stays NULL instead of a NULL pointer being dereferenced.
+ */
+static void
+ios_track_loc (call_frame_t *frame, loc_t *loc)
+{
+        struct ios_local *local = NULL;
+
+        if (!loc || !loc->path)
+                return;
+
+        /* frame->local should only ever be set by ios_track_loc() or
+         * ios_track_fd(). Reusing an existing local lets calls to the
+         * two helpers be chained without clobbering frame->local.
+         */
+        local = frame->local;
+        if (!local) {
+                local = ios_local_new ();
+                if (!local)
+                        return;
+        }
+
+        loc_copy (&local->loc, loc);
+        frame->local = local;
+}
+
+/* Store a reference to the fop's fd and to its inode in the ios_local
+ * attached to the frame.
+ *
+ * Nothing is tracked when fd is NULL or has no inode. When the
+ * ios_local cannot be allocated the frame is left untouched, so
+ * frame->local stays NULL instead of a NULL pointer being dereferenced.
+ */
+static void
+ios_track_fd (call_frame_t *frame, fd_t *fd)
+{
+        struct ios_local *local = NULL;
+
+        if (!fd || !fd->inode)
+                return;
+
+        /* Reuse a local created by an earlier ios_track_loc() or
+         * ios_track_fd() call on this frame.
+         */
+        local = frame->local;
+        if (!local) {
+                local = ios_local_new ();
+                if (!local)
+                        return;
+        }
+
+        local->fd = fd_ref (fd);
+        local->inode = inode_ref (fd->inode);
+        frame->local = local;
+}
+
+
#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"
#ifdef GF_LINUX_HOST_OS
#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats"
@@ -206,7 +284,7 @@ is_fop_latency_started (call_frame_t *frame)
conf = this->private; \
if (conf && conf->measure_latency) { \
gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op); \
+ update_ios_latency (conf, frame, GF_FOP_##op, 0, 0); \
} \
} while (0)
@@ -244,7 +322,7 @@ is_fop_latency_started (call_frame_t *frame)
#define STATS_ADD(x,i) (x) += (i)
#endif
-#define UPDATE_PROFILE_STATS(frame, op) \
+#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \
do { \
struct ios_conf *conf = NULL; \
\
@@ -257,7 +335,8 @@ is_fop_latency_started (call_frame_t *frame)
conf->count_fop_hits) { \
BUMP_FOP(op); \
gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op);\
+ update_ios_latency (conf, frame, GF_FOP_##op, \
+ op_ret, op_errno); \
} \
} \
STATS_UNLOCK (&conf->lock); \
@@ -694,7 +773,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this,
int
_io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
- char *key_root = "gluster";
+ char *key_root = "storage.gluster";
char *xlator_name = NULL;
char *instance_name = NULL;
size_t key_len = 0;
@@ -719,7 +798,7 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
}
if (strcmp (__progname, "glusterfsd") == 0)
- key_root = "gluster.brick";
+ key_root = "storage.gluster.brick";
if (instance_name) {
/* +3 for 2 x "." + NULL */
@@ -1010,7 +1089,10 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
char *port_pos = NULL;
char *group_name = NULL;
char *username = NULL;
+ char *path = NULL;
struct ios_conf *conf = NULL;
+ const char *error_string = NULL;
+ int32_t op_errno = 0;
conf = this->private;
@@ -1057,12 +1139,22 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
sprintf (group_name, "%d", (int32_t)sample->gid);
}
+ path = "Unknown";
+ if (sample->have_path)
+ path = sample->path;
+
+ error_string = "No Error";
+ if (sample->op_ret != 0) {
+ op_errno = abs (sample->op_errno);
+ error_string = strerror (op_errno);
+ }
+
ios_log (this, logfp,
- "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s",
+ "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s",
epoch_time, fop_enum_to_pri_string (sample->fop_type),
fop_enum_to_string (sample->fop_type),
sample->elapsed, xlator_name, instance_name, username,
- group_name, hostname, port);
+ group_name, hostname, port, path, op_errno, error_string);
goto out;
err:
gf_log (this->name, GF_LOG_ERROR,
@@ -1608,14 +1700,87 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd)
return 0;
}
+/* Pick the inode associated with an ios_local.
+ *
+ * The inode of the tracked loc wins when present; otherwise the inode
+ * stored directly in the local is returned, which may itself be NULL.
+ * When local is NULL, *inode is left untouched.
+ */
+void ios_local_get_inode (struct ios_local *local, inode_t **inode)
+{
+        if (local == NULL)
+                return;
+
+        *inode = local->loc.inode ? local->loc.inode : local->inode;
+}
+
+/* Resolve the path of the file a fop operated on, using frame->local.
+ *
+ * Lookup order:
+ *   1. the filename of the ios_stat attached to the inode, if any;
+ *   2. the path of the loc stored in the local.
+ * *path is only written when one of the two sources yields a path; the
+ * final check reads *path, so the caller must initialise it.
+ */
+void ios_local_get_path (call_frame_t *frame, const char **path)
+{
+        struct ios_local *local   = frame->local;
+        struct ios_stat  *iosstat = NULL;
+        inode_t          *inode   = NULL;
+
+        if (local) {
+                ios_local_get_inode (local, &inode);
+
+                /* Each inode should have an iosstat struct attached to
+                 * it; that is the preferred source of the path.
+                 */
+                if (inode)
+                        ios_inode_ctx_get (inode, frame->this, &iosstat);
+
+                if (iosstat) {
+                        gf_log ("io-stats", GF_LOG_DEBUG,
+                                "[%s] Getting path from iostat struct",
+                                fop_enum_to_string (frame->op));
+                        *path = iosstat->filename;
+                } else if (local->loc.path) {
+                        /* No iosstat on the inode: fall back to the
+                         * loc kept inside the local.
+                         */
+                        gf_log ("io-stats", GF_LOG_DEBUG,
+                                "[%s] Getting path from loc_t",
+                                fop_enum_to_string (frame->op));
+                        *path = local->loc.path;
+                }
+        }
+
+        /* Neither the inode nor the loc had a path: nothing more to try. */
+        if (!*path) {
+                gf_log ("io-stats", GF_LOG_DEBUG,
+                        "Unable to get path for fop: %s",
+                        fop_enum_to_string (frame->op));
+        }
+}
+
void collect_ios_latency_sample (struct ios_conf *conf,
glusterfs_fop_t fop_type, double elapsed,
- call_frame_t *frame)
+ call_frame_t *frame, int32_t op_ret, int32_t op_errno)
{
+ struct ios_local *ios_local = NULL;
ios_sample_buf_t *ios_sample_buf = NULL;
ios_sample_t *ios_sample = NULL;
struct timeval *timestamp = NULL;
call_stack_t *root = NULL;
+ const char *path = NULL;
ios_sample_buf = conf->ios_sample_buf;
@@ -1630,6 +1795,8 @@ void collect_ios_latency_sample (struct ios_conf *conf,
ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);
ios_sample->elapsed = elapsed;
ios_sample->fop_type = fop_type;
+ ios_sample->op_ret = op_ret;
+ ios_sample->op_errno = op_errno;
ios_sample->uid = root->uid;
ios_sample->gid = root->gid;
(ios_sample->timestamp).tv_sec = timestamp->tv_sec;
@@ -1637,6 +1804,52 @@ void collect_ios_latency_sample (struct ios_conf *conf,
memcpy (&ios_sample->identifier, &root->identifier,
sizeof (root->identifier));
+ /* Eventually every FOP will be supported
+ * (i.e., the frame->local will be
+ * of type struct ios_local), but for now, this is a safety.
+ */
+ switch (ios_sample->fop_type) {
+
+ case GF_FOP_CREATE:
+ case GF_FOP_OPEN:
+ case GF_FOP_STAT:
+ case GF_FOP_FSTAT:
+ case GF_FOP_READ:
+ case GF_FOP_WRITE:
+ case GF_FOP_OPENDIR:
+ case GF_FOP_READDIRP:
+ case GF_FOP_READDIR:
+ case GF_FOP_FLUSH:
+ case GF_FOP_ACCESS:
+ case GF_FOP_UNLINK:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RMDIR:
+ case GF_FOP_SETATTR:
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FXATTROP:
+ case GF_FOP_XATTROP:
+ case GF_FOP_GETXATTR:
+ case GF_FOP_FGETXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_STATFS:
+ case GF_FOP_FSYNC:
+ ios_local_get_path (frame, &path);
+ break;
+ default:
+ path = NULL;
+ break;
+ }
+
+ if (path) {
+ strncpy (ios_sample->path, path, sizeof (ios_sample->path));
+ ios_sample->have_path = _gf_true;
+ }
+
/* We've reached the end of the circular buffer, start from the
* beginning. */
if (ios_sample_buf->pos == (ios_sample_buf->size - 1))
@@ -1674,7 +1887,7 @@ update_ios_latency_stats (struct ios_global_stats *stats, double elapsed,
int
update_ios_latency (struct ios_conf *conf, call_frame_t *frame,
- glusterfs_fop_t op)
+ glusterfs_fop_t op, int32_t op_ret, int32_t op_errno)
{
double elapsed;
struct timeval *begin, *end;
@@ -1687,7 +1900,7 @@ update_ios_latency (struct ios_conf *conf, call_frame_t *frame,
update_ios_latency_stats (&conf->cumulative, elapsed, op);
update_ios_latency_stats (&conf->incremental, elapsed, op);
- collect_ios_latency_sample (conf, op, elapsed, frame);
+ collect_ios_latency_sample (conf, op, elapsed, frame, op_ret, op_errno);
return 0;
}
@@ -1811,40 +2024,100 @@ unlock_list_head:
return ret;
}
+/* Make sure the inode carries an ios_stat in its context.
+ *
+ * If the inode already has one, nothing is changed. Otherwise a new
+ * ios_stat is allocated, filled with a copy of path and gfid, its lock
+ * is initialised and it is stored in the inode context.
+ *
+ * Returns 0 on success (including "already attached"), -EINVAL when
+ * inode is NULL and -ENOMEM when the ios_stat cannot be allocated.
+ * NOTE(review): path is handed to gf_strdup() unchecked - confirm that
+ * callers never pass NULL.
+ */
+static int
+attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path,
+                         const uuid_t gfid)
+{
+        struct ios_stat *iosstat = NULL;
+
+        if (inode == NULL)
+                return -EINVAL;
+
+        ios_inode_ctx_get (inode, this, &iosstat);
+        if (iosstat != NULL)
+                return 0;
+
+        iosstat = GF_CALLOC (1, sizeof (*iosstat),
+                             gf_io_stats_mt_ios_stat);
+        if (iosstat == NULL)
+                return -ENOMEM;
+
+        iosstat->filename = gf_strdup (path);
+        gf_uuid_copy (iosstat->gfid, gfid);
+        LOCK_INIT (&iosstat->lock);
+        ios_inode_ctx_set (inode, this, iosstat);
+
+        return 0;
+}
+
+
+/* Build the per-fd bookkeeping object for an opened file.
+ *
+ * Allocates an ios_fd, duplicates path into it when path is non-NULL,
+ * records the current time as opened_at and, when fd is non-NULL,
+ * stores the object in the fd context.
+ *
+ * On success returns 0 and hands the object back through *iosfd.
+ * On allocation failure returns -ENOMEM and sets *iosfd to NULL.
+ */
+int
+ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd)
+{
+        struct ios_fd *new_fd = NULL;
+
+        new_fd = GF_CALLOC (1, sizeof (*new_fd), gf_io_stats_mt_ios_fd);
+        if (new_fd == NULL) {
+                *iosfd = NULL;
+                return -ENOMEM;
+        }
+
+        if (path != NULL) {
+                new_fd->filename = gf_strdup (path);
+                if (new_fd->filename == NULL) {
+                        /* Nothing else is owned yet; drop the struct. */
+                        GF_FREE (new_fd);
+                        *iosfd = NULL;
+                        return -ENOMEM;
+                }
+        }
+
+        gettimeofday (&new_fd->opened_at, NULL);
+
+        if (fd != NULL)
+                ios_fd_ctx_set (fd, this, new_fd);
+
+        *iosfd = new_fd;
+        return 0;
+}
+
+
int
io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
- struct ios_stat *iosstat = NULL;
- struct ios_conf *conf = NULL;
-
- conf = this->private;
+ struct ios_local *local = NULL;
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
- path = frame->local;
- frame->local = NULL;
-
- if (!path)
+ if (op_ret < 0) {
goto unwind;
+ }
- if (op_ret < 0) {
- GF_FREE (path);
+ local = frame->local;
+ if (!local) {
goto unwind;
}
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
+ conf = this->private;
+
+ ios_build_fd (this, local->loc.path, fd, &iosfd);
if (!iosfd) {
- GF_FREE (path);
goto unwind;
}
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
-
- ios_fd_ctx_set (fd, this, iosfd);
LOCK (&conf->lock);
{
conf->cumulative.nr_opens++;
@@ -1855,18 +2128,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&conf->lock);
- iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat);
- if (!iosstat) {
- GF_FREE (path);
- goto unwind;
- }
- iosstat->filename = gf_strdup (path);
- gf_uuid_copy (iosstat->gfid, buf->ia_gfid);
- LOCK_INIT (&iosstat->lock);
- ios_inode_ctx_set (fd->inode, this, iosstat);
+ attach_iosstat_to_inode (this, local->loc.inode, local->loc.path,
+ buf->ia_gfid);
unwind:
- UPDATE_PROFILE_STATS (frame, CREATE);
+ UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -1877,44 +2144,24 @@ int
io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
- struct ios_stat *iosstat = NULL;
- struct ios_conf *conf = NULL;
-
- conf = this->private;
- path = frame->local;
- frame->local = NULL;
-
- if (!path)
- goto unwind;
+ struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
if (op_ret < 0) {
- GF_FREE (path);
goto unwind;
}
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
- if (!iosfd) {
- GF_FREE (path);
+ local = frame->local;
+ if (!local) {
goto unwind;
}
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
-
- ios_fd_ctx_set (fd, this, iosfd);
-
- ios_inode_ctx_get (fd->inode, this, &iosstat);
- if (!iosstat) {
- iosstat = GF_CALLOC (1, sizeof (*iosstat),
- gf_io_stats_mt_ios_stat);
- if (iosstat) {
- iosstat->filename = gf_strdup (path);
- gf_uuid_copy (iosstat->gfid, fd->inode->gfid);
- LOCK_INIT (&iosstat->lock);
- ios_inode_ctx_set (fd->inode, this, iosstat);
- }
+ conf = this->private;
+ ios_build_fd (this, local->loc.path, fd, &iosfd);
+ if (!iosfd) {
+ goto unwind;
}
LOCK (&conf->lock);
@@ -1926,13 +2173,19 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
UNLOCK (&conf->lock);
+
+ ios_inode_ctx_get (fd->inode, this, &iosstat);
if (iosstat) {
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN);
- iosstat = NULL;
}
-unwind:
- UPDATE_PROFILE_STATS (frame, OPEN);
+ attach_iosstat_to_inode (this, local->loc.inode,
+ local->loc.path,
+ local->loc.inode->gfid);
+
+unwind:
+ UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
return 0;
@@ -1943,7 +2196,8 @@ int
io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, STAT);
+ UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -1956,26 +2210,29 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iobref *iobref, dict_t *xdata)
{
int len = 0;
- fd_t *fd = NULL;
struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
- fd = frame->local;
- frame->local = NULL;
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
if (op_ret > 0) {
len = iov_length (vector, count);
- BUMP_READ (fd, len);
+ BUMP_READ (local->fd, len);
}
- UPDATE_PROFILE_STATS (frame, READ);
- ios_inode_ctx_get (fd->inode, this, &iosstat);
+ UPDATE_PROFILE_STATS (frame, READ, op_ret, op_errno);
+ ios_inode_ctx_get (local->fd->inode, this, &iosstat);
if (iosstat) {
- BUMP_STATS (iosstat, IOS_STATS_TYPE_READ);
- BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ);
- iosstat = NULL;
+ BUMP_STATS (iosstat, IOS_STATS_TYPE_READ);
+ BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ);
+
}
+unwind:
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
vector, count, buf, iobref, xdata);
return 0;
@@ -1989,21 +2246,23 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
inode_t *inode = NULL;
- UPDATE_PROFILE_STATS (frame, WRITE);
- if (frame->local){
- inode = frame->local;
- frame->local = NULL;
- ios_inode_ctx_get (inode, this, &iosstat);
- if (iosstat) {
- BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE);
- BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE);
- inode = NULL;
- iosstat = NULL;
- }
- }
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
+
+ UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno);
+
+ ios_inode_ctx_get (local->inode, this, &iosstat);
+ if (iosstat) {
+ BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE);
+ BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE);
+ }
+unwind:
+ ios_free_local (frame);
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
return 0;
@@ -2021,7 +2280,7 @@ io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
frame->local = NULL;
- UPDATE_PROFILE_STATS (frame, READDIRP);
+ UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno);
ios_inode_ctx_get (inode, this, &iosstat);
@@ -2039,7 +2298,16 @@ int
io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, READDIR);
+        /* Account for the READDIR exactly once, and do it while
+         * frame->local is still attached: the latency sample reads the
+         * path from frame->local, which ios_free_local() releases.
+         */
+        UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno);
+
+        ios_free_local (frame);
+
STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2050,8 +2318,10 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSYNC);
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+ UPDATE_PROFILE_STATS (frame, FSYNC, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
return 0;
}
@@ -2061,7 +2331,8 @@ io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SETATTR);
+ UPDATE_PROFILE_STATS (frame, SETATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, xdata);
return 0;
}
@@ -2072,7 +2343,8 @@ io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, UNLINK);
+ UPDATE_PROFILE_STATS (frame, UNLINK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
preparent, postparent, xdata);
return 0;
@@ -2086,7 +2358,7 @@ io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preoldparent, struct iatt *postoldparent,
struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, RENAME);
+ UPDATE_PROFILE_STATS (frame, RENAME, op_ret, op_errno);
STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
preoldparent, postoldparent,
prenewparent, postnewparent, xdata);
@@ -2099,7 +2371,8 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, const char *buf,
struct iatt *sbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, READLINK);
+ UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
return 0;
}
@@ -2111,7 +2384,14 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
dict_t *xdata, struct iatt *postparent)
{
- UPDATE_PROFILE_STATS (frame, LOOKUP);
+ struct ios_local *local = frame->local;
+
+ if (local && local->loc.path && inode && op_ret >= 0) {
+ attach_iosstat_to_inode (this, inode, local->loc.path,
+ inode->gfid);
+ }
+ UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,
postparent);
return 0;
@@ -2124,7 +2404,7 @@ io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SYMLINK);
+ UPDATE_PROFILE_STATS (frame, SYMLINK, op_ret, op_errno);
STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2137,7 +2417,7 @@ io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, MKNOD);
+ UPDATE_PROFILE_STATS (frame, MKNOD, op_ret, op_errno);
STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2151,28 +2431,16 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata)
{
- struct ios_stat *iosstat = NULL;
- char *path = frame->local;
+ struct ios_local *local = frame->local;
- if (!path)
- goto unwind;
-
- UPDATE_PROFILE_STATS (frame, MKDIR);
- if (op_ret < 0)
- goto unwind;
-
- iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat);
- if (iosstat) {
- LOCK_INIT (&iosstat->lock);
- iosstat->filename = gf_strdup(path);
- gf_uuid_copy (iosstat->gfid, buf->ia_gfid);
- ios_inode_ctx_set (inode, this, iosstat);
+ if (local && local->loc.path) {
+ local->inode = inode_ref (inode);
+ attach_iosstat_to_inode (this, inode, local->loc.path,
+ buf->ia_gfid);
}
-unwind:
- /* local is assigned with path */
- GF_FREE (frame->local);
- frame->local = NULL;
+ UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2185,7 +2453,7 @@ io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, LINK);
+ UPDATE_PROFILE_STATS (frame, LINK, op_ret, op_errno);
STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2196,7 +2464,8 @@ int
io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FLUSH);
+ UPDATE_PROFILE_STATS (frame, FLUSH, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2206,20 +2475,28 @@ int
io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- struct ios_stat *iosstat = NULL;
- int ret = -1;
+ struct ios_local *local = NULL;
+ struct ios_stat *iosstat = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
- UPDATE_PROFILE_STATS (frame, OPENDIR);
if (op_ret < 0)
goto unwind;
- ios_fd_ctx_set (fd, this, 0);
+ attach_iosstat_to_inode (this, local->inode, local->loc.path,
+ local->inode->gfid);
- ret = ios_inode_ctx_get (fd->inode, this, &iosstat);
- if (!ret)
+ ios_fd_ctx_set (local->fd, this, 0);
+ ios_inode_ctx_get (local->fd->inode, this, &iosstat);
+ if (iosstat)
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR);
unwind:
+ UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
return 0;
}
@@ -2231,8 +2508,8 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, RMDIR);
-
+ UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
preparent, postparent, xdata);
return 0;
@@ -2244,7 +2521,8 @@ io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, TRUNCATE);
+ UPDATE_PROFILE_STATS (frame, TRUNCATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
prebuf, postbuf, xdata);
return 0;
@@ -2255,7 +2533,8 @@ int
io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, STATFS);
+ UPDATE_PROFILE_STATS (frame, STATFS, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2265,7 +2544,8 @@ int
io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SETXATTR);
+ UPDATE_PROFILE_STATS (frame, SETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2275,7 +2555,8 @@ int
io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, GETXATTR);
+ UPDATE_PROFILE_STATS (frame, GETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2285,7 +2566,8 @@ int
io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, REMOVEXATTR);
+ UPDATE_PROFILE_STATS (frame, REMOVEXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2294,7 +2576,8 @@ int
io_stats_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSETXATTR);
+ UPDATE_PROFILE_STATS (frame, FSETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2304,7 +2587,8 @@ int
io_stats_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FGETXATTR);
+ UPDATE_PROFILE_STATS (frame, FGETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2314,7 +2598,8 @@ int
io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FREMOVEXATTR);
+ UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2324,7 +2609,8 @@ int
io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSYNCDIR);
+ UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2334,7 +2620,20 @@ int
io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, ACCESS);
+        struct ios_local *local = frame->local;
+
+        /* ACCESS is called before a READ when a fop fails over
+         * in NFS. We need to make sure that we are attaching the
+         * data correctly to this inode.
+         *
+         * frame->local is NULL when no loc was tracked for this fop,
+         * so check it before looking at the loc.
+         */
+        if (local && local->loc.inode && local->loc.path) {
+                attach_iosstat_to_inode (this, local->loc.inode,
+                                         local->loc.path,
+                                         local->loc.inode->gfid);
+        }
+
+        UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno);
+        ios_free_local (frame);
STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2345,7 +2644,8 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FTRUNCATE);
+ UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
prebuf, postbuf, xdata);
return 0;
@@ -2356,7 +2656,8 @@ int
io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSTAT);
+ UPDATE_PROFILE_STATS (frame, FSTAT, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2367,8 +2668,9 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, FALLOCATE);
- STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -2379,8 +2681,9 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, DISCARD);
- STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ UPDATE_PROFILE_STATS (frame, DISCARD, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -2390,7 +2693,8 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, ZEROFILL);
+ UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
@@ -2400,7 +2704,8 @@ int
io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, LK);
+ UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);
return 0;
}
@@ -2410,7 +2715,8 @@ int
io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, ENTRYLK);
+ UPDATE_PROFILE_STATS (frame, ENTRYLK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2420,7 +2726,8 @@ int
io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, XATTROP);
+ UPDATE_PROFILE_STATS (frame, XATTROP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2430,7 +2737,8 @@ int
io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FXATTROP);
+ UPDATE_PROFILE_STATS (frame, FXATTROP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2440,7 +2748,8 @@ int
io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, INODELK);
+ UPDATE_PROFILE_STATS (frame, INODELK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2450,6 +2759,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_entrylk_cbk,
@@ -2464,6 +2775,7 @@ int
io_stats_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
@@ -2479,8 +2791,8 @@ int
io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
-
- UPDATE_PROFILE_STATS (frame, FINODELK);
+ UPDATE_PROFILE_STATS (frame, FINODELK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2490,6 +2802,7 @@ int
io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_finodelk_cbk,
@@ -2504,6 +2817,7 @@ int
io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_xattrop_cbk,
@@ -2518,6 +2832,7 @@ int
io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fxattrop_cbk,
@@ -2532,6 +2847,7 @@ int
io_stats_lookup (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_lookup_cbk,
@@ -2545,6 +2861,7 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this,
int
io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_stat_cbk,
@@ -2559,6 +2876,7 @@ int
io_stats_readlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, size_t size, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_readlink_cbk,
@@ -2573,6 +2891,7 @@ int
io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_mknod_cbk,
@@ -2587,9 +2906,7 @@ int
io_stats_mkdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
-
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_mkdir_cbk,
@@ -2604,6 +2921,7 @@ int
io_stats_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_unlink_cbk,
@@ -2618,6 +2936,7 @@ int
io_stats_rmdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, int flags, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_rmdir_cbk,
@@ -2674,6 +2993,7 @@ int
io_stats_setattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_setattr_cbk,
@@ -2688,6 +3008,7 @@ int
io_stats_truncate (call_frame_t *frame, xlator_t *this,
loc_t *loc, off_t offset, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_truncate_cbk,
@@ -2702,8 +3023,8 @@ int
io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t flags, fd_t *fd, dict_t *xdata)
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
+ ios_track_loc (frame, loc);
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
@@ -2719,9 +3040,10 @@ int
io_stats_create (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, mode_t mode,
mode_t umask, fd_t *fd, dict_t *xdata)
+
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
+ ios_track_loc (frame, loc);
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
@@ -2737,8 +3059,7 @@ int
io_stats_readv (call_frame_t *frame, xlator_t *this,
fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
- frame->local = fd;
-
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_readv_cbk,
@@ -2756,9 +3077,12 @@ io_stats_writev (call_frame_t *frame, xlator_t *this,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
int len = 0;
+ struct ios_conf *conf = NULL;
+ struct ios_local *local = NULL;
+ int ret = 0;
+
+ ios_track_fd (frame, fd);
- if (fd->inode)
- frame->local = fd->inode;
len = iov_length (vector, count);
BUMP_WRITE (fd, len);
@@ -2777,6 +3101,7 @@ int
io_stats_statfs (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_statfs_cbk,
@@ -2791,6 +3116,7 @@ int
io_stats_flush (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_flush_cbk,
@@ -2805,6 +3131,7 @@ int
io_stats_fsync (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t flags, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fsync_cbk,
@@ -2971,7 +3298,7 @@ _ios_dump_thread (xlator_t *this) {
stats_filename, strerror(errno));
log_stats_fopen_failure = _gf_false;
}
- samples_logfp = fopen (samples_filename, "w+");
+ samples_logfp = fopen (samples_filename, "a");
if (samples_logfp) {
io_stats_dump_latency_samples_logfp (this,
samples_logfp);
@@ -3024,6 +3351,8 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this,
goto out;
}
+ ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_setxattr_cbk,
@@ -3042,6 +3371,7 @@ int
io_stats_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_getxattr_cbk,
@@ -3056,6 +3386,7 @@ int
io_stats_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_removexattr_cbk,
@@ -3071,6 +3402,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *dict,
int32_t flags, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fsetxattr_cbk,
@@ -3085,6 +3417,7 @@ int
io_stats_fgetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fgetxattr_cbk,
@@ -3099,6 +3432,7 @@ int
io_stats_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fremovexattr_cbk,
@@ -3170,6 +3504,7 @@ int
io_stats_access (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t mask, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_access_cbk,
@@ -3212,6 +3547,7 @@ int
io_stats_fstat (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fstat_cbk,
diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
index 270632bc71b..2eb3a9f9149 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
@@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc,
case RPC_CLNT_DISCONNECT:
case RPC_CLNT_MSG:
case RPC_CLNT_DESTROY:
+ case RPC_CLNT_PING:
break;
}
diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c
index 77637c7beec..459d173db7f 100644
--- a/xlators/features/changelog/src/changelog-ev-handle.c
+++ b/xlators/features/changelog/src/changelog-ev-handle.c
@@ -180,6 +180,8 @@ changelog_rpc_notify (struct rpc_clnt *rpc,
/* Free up mydata */
changelog_rpc_clnt_unref (crpc);
break;
+ case RPC_CLNT_PING:
+ break;
}
return 0;
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index 640c6bb5553..d7c210f24a5 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -234,6 +234,7 @@ blkd:
continue;
bcount++;
+ list_del_init (&ilock->client_list);
list_del_init (&ilock->blocked_locks);
list_add (&ilock->blocked_locks, &released);
}
@@ -268,6 +269,7 @@ granted:
continue;
gcount++;
+ list_del_init (&ilock->client_list);
list_del_init (&ilock->list);
list_add (&ilock->list, &released);
}
@@ -321,6 +323,7 @@ blkd:
bcount++;
+ list_del_init (&elock->client_list);
list_del_init (&elock->blocked_locks);
list_add_tail (&elock->blocked_locks, &released);
}
@@ -355,6 +358,7 @@ granted:
}
gcount++;
+ list_del_init (&elock->client_list);
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index d56a7aca2be..c40c29de63a 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -1116,3 +1116,16 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)
return conf;
}
+
+/*
+ * Decide whether an unlock request should be deliberately dropped when
+ * the "monkey-unlocking" option is enabled.
+ *
+ * Draws one value from random() and returns _gf_true when that value is
+ * a multiple of 100, _gf_false otherwise.
+ */
+gf_boolean_t
+pl_does_monkey_want_stuck_lock (void)
+{
+        long int monkey_unlock_rand = random ();
+
+        return (monkey_unlock_rand % 100 == 0) ? _gf_true : _gf_false;
+}
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 5486f9b8314..3729ca24bed 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -161,4 +161,7 @@ pl_metalock_is_active (pl_inode_t *pl_inode);
int
__pl_queue_lock (pl_inode_t *pl_inode, posix_lock_t *reqlock, int can_block);
+
+/* Returns _gf_true when a random draw says an unlock should be dropped. */
+gf_boolean_t
+pl_does_monkey_want_stuck_lock (void);
#endif /* __COMMON_H__ */
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index 783c57e6381..4231d760cdc 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -16,9 +16,9 @@
#include "list.h"
#include "locks.h"
+#include "clear.h"
#include "common.h"
-
void
__pl_entrylk_unref (pl_entry_lock_t *lock)
{
@@ -111,6 +111,97 @@ __conflicting_entrylks (pl_entry_lock_t *l1, pl_entry_lock_t *l2)
return 0;
}
+/*
+ * Entry-lock counterpart of the staleness check in inodelk.c.
+ *
+ * If the basenames of @candidate_lock and @requested_lock conflict, the
+ * candidate's age (current time minus its granted_time, in seconds) is
+ * stored in @lock_age_sec and _gf_true is returned when that age exceeds
+ * priv->revocation_secs. When the names do not conflict, @lock_age_sec is
+ * left untouched and _gf_false is returned.
+ *
+ * Open question kept from the original author: should every lock be
+ * pruned given the chance, or only the ones we are trying to acquire?
+ */
+static inline gf_boolean_t
+__stale_entrylk (xlator_t *this, pl_entry_lock_t *candidate_lock,
+                 pl_entry_lock_t *requested_lock, time_t *lock_age_sec)
+{
+        posix_locks_private_t *conf = this->private;
+        struct timeval         now;
+
+        gettimeofday (&now, NULL);
+
+        if (!names_conflict (candidate_lock->basename,
+                             requested_lock->basename))
+                return _gf_false;
+
+        *lock_age_sec = now.tv_sec - candidate_lock->granted_time.tv_sec;
+
+        return (*lock_age_sec > conf->revocation_secs) ? _gf_true
+                                                       : _gf_false;
+}
+
+/*
+ * Entry-lock counterpart of the revocation logic in inodelk.c.
+ *
+ * Decides whether the entry locks of domain @dom on @pinode should be
+ * revoked on behalf of the incoming request @lock, and clears them if so.
+ * Revocation is triggered by either of:
+ *   - "age": a granted lock conflicting with @lock is older than
+ *     priv->revocation_secs (see __stale_entrylk);
+ *   - "max blocked": priv->revocation_max_blocked is non-zero and at
+ *     least that many requests are queued on dom->blocked_entrylks.
+ *
+ * What is cleared depends on priv->revocation_clear_all: CLRLK_ALL when
+ * set, CLRLK_GRANTED otherwise.
+ *
+ * pinode->mutex is taken here for the scan (the caller must not hold it)
+ * and released before clrlk_clear_entrylk() is called.
+ *
+ * Returns _gf_true when locks were revoked, _gf_false otherwise.
+ */
+static gf_boolean_t
+__entrylk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                       pl_entry_lock_t *lock)
+{
+        posix_locks_private_t *priv        = this->private;
+        pl_entry_lock_t       *tmp         = NULL;
+        pl_entry_lock_t       *lk          = NULL;
+        gf_boolean_t           revoke_lock = _gf_false;
+        int                    bcount      = 0;
+        int                    gcount      = 0;
+        int                    op_errno    = 0;
+        clrlk_args             args;
+        time_t                 lk_age_sec  = 0;
+        uint32_t               max_blocked = 0;
+        char                  *reason_str  = NULL;
+
+        args.opts = NULL;
+        args.type = CLRLK_ENTRY;
+        if (priv->revocation_clear_all == _gf_true)
+                args.kind = CLRLK_ALL;
+        else
+                args.kind = CLRLK_GRANTED;
+
+        pthread_mutex_lock (&pinode->mutex);
+        {
+                /* The emptiness test is done under the mutex so the list
+                 * is never inspected while another thread modifies it. */
+                if (list_empty (&dom->entrylk_list))
+                        goto unlock;
+
+                lock->pinode = pinode;
+                list_for_each_entry_safe (lk, tmp, &dom->entrylk_list,
+                                          domain_list) {
+                        if (__stale_entrylk (this, lk, lock,
+                                             &lk_age_sec) == _gf_true) {
+                                revoke_lock = _gf_true;
+                                reason_str = "age";
+                                break;
+                        }
+                }
+
+                /* Count blocked requests down from the configured limit;
+                 * reaching zero means the limit has been hit. */
+                max_blocked = priv->revocation_max_blocked;
+                if (max_blocked != 0 && revoke_lock == _gf_false) {
+                        list_for_each_entry_safe (lk, tmp,
+                                                  &dom->blocked_entrylks,
+                                                  blocked_locks) {
+                                max_blocked--;
+                                if (max_blocked == 0) {
+                                        revoke_lock = _gf_true;
+                                        reason_str = "max blocked";
+                                        break;
+                                }
+                        }
+                }
+        }
+unlock:
+        pthread_mutex_unlock (&pinode->mutex);
+
+        if (revoke_lock == _gf_true) {
+                clrlk_clear_entrylk (this, pinode, dom, &args, &bcount, &gcount,
+                                     &op_errno);
+                /* For "max blocked" the age printed is that of the last
+                 * conflicting granted lock examined (0 if none). time_t is
+                 * cast so the argument matches %ld on every platform. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+                        "age: %ld sec] - Entry lock revoked: %d granted & %d "
+                        "blocked locks cleared", reason_str,
+                        uuid_utoa (pinode->gfid), dom->domain,
+                        (long) lk_age_sec, gcount, bcount);
+        }
+
+        return revoke_lock;
+}
+
/**
* entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
@@ -546,6 +637,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
pl_ctx_t *ctx = NULL;
int nonblock = 0;
gf_boolean_t need_inode_unref = _gf_false;
+ posix_locks_private_t *priv = NULL;
+
+ priv = this->private;
if (xdata)
dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
@@ -599,6 +693,24 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
* current stack unwinds.
*/
pinode->inode = inode_ref (inode);
+ if (priv->revocation_secs != 0) {
+ if (cmd != ENTRYLK_UNLOCK) {
+ __entrylk_prune_stale (this, pinode, dom, reqlock);
+ } else if (priv->monkey_unlocking == _gf_true) {
+ if (pl_does_monkey_want_stuck_lock ()) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "MONKEY LOCKING (forcing stuck lock)!");
+ op_ret = 0;
+ need_inode_unref = _gf_true;
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ __pl_entrylk_unref (reqlock);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+ goto out;
+ }
+ }
+ }
switch (cmd) {
case ENTRYLK_LOCK_NB:
@@ -678,9 +790,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
"a bug report at http://bugs.gluster.com", cmd);
goto out;
}
- if (need_inode_unref)
- inode_unref (pinode->inode);
-
/* The following (extra) unref corresponds to the ref that
* was done at the time the lock was granted.
*/
@@ -689,6 +798,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
out:
+ if (need_inode_unref)
+ inode_unref (pinode->inode);
+
if (unwind) {
entrylk_trace_out (this, frame, volume, fd, loc, basename,
cmd, type, op_ret, op_errno);
@@ -772,8 +884,6 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,
client_list) {
- list_del_init (&l->client_list);
-
pl_entrylk_log_cleanup (l);
pinode = l->pinode;
@@ -810,6 +920,8 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
* blocked to avoid leaving L1 to starve forever.
* iv. unref the object.
*/
+ list_del_init (&l->client_list);
+
if (!list_empty (&l->domain_list)) {
list_del_init (&l->domain_list);
list_add_tail (&l->client_list,
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 1564f26b8fb..e1702c78ba1 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -16,6 +16,7 @@
#include "list.h"
#include "locks.h"
+#include "clear.h"
#include "common.h"
void
@@ -130,6 +131,105 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
inodelk_type_conflict (l1, l2));
}
+/*
+ * Staleness test for a single granted inode lock.
+ *
+ * When @candidate_lock conflicts with @requested_lock (per
+ * inodelk_conflict), its age in seconds (current time minus the
+ * candidate's granted_time) is written to @lock_age_sec and _gf_true is
+ * returned if that age exceeds priv->revocation_secs. Without a conflict
+ * @lock_age_sec is left untouched and _gf_false is returned.
+ *
+ * Open question kept from the original author: should every lock be
+ * pruned given the chance, or only the ones we are trying to acquire?
+ */
+static inline gf_boolean_t
+__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock,
+                 pl_inode_lock_t *requested_lock, time_t *lock_age_sec)
+{
+        posix_locks_private_t *conf = this->private;
+        struct timeval         now;
+
+        gettimeofday (&now, NULL);
+
+        if (!inodelk_conflict (candidate_lock, requested_lock))
+                return _gf_false;
+
+        *lock_age_sec = now.tv_sec - candidate_lock->granted_time.tv_sec;
+
+        return (*lock_age_sec > conf->revocation_secs) ? _gf_true
+                                                       : _gf_false;
+}
+
+/*
+ * Examine the inode locks of domain @dom on @pinode and revoke them when
+ * a revocation threshold is hit for the incoming request @lock.
+ * Revocation is triggered by either of:
+ *   - "age": a granted lock conflicting with @lock is older than
+ *     priv->revocation_secs (see __stale_inodelk);
+ *   - "max blocked": priv->revocation_max_blocked is non-zero and at
+ *     least that many requests are queued on dom->blocked_inodelks.
+ *
+ * By default only granted locks are cleared (CLRLK_GRANTED); when
+ * priv->revocation_clear_all is set, CLRLK_ALL is requested instead.
+ *
+ * pinode->mutex is taken here for the scan (the caller must not hold it)
+ * and released before clrlk_clear_inodelk() is called.
+ *
+ * Returns _gf_true when locks were revoked, _gf_false otherwise.
+ */
+static gf_boolean_t
+__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                       pl_inode_lock_t *lock)
+{
+        posix_locks_private_t *priv        = this->private;
+        pl_inode_lock_t       *tmp         = NULL;
+        pl_inode_lock_t       *lk          = NULL;
+        gf_boolean_t           revoke_lock = _gf_false;
+        int                    bcount      = 0;
+        int                    gcount      = 0;
+        int                    op_errno    = 0;
+        clrlk_args             args;
+        time_t                 lk_age_sec  = 0;
+        uint32_t               max_blocked = 0;
+        char                  *reason_str  = NULL;
+
+        args.opts = NULL;
+        args.type = CLRLK_INODE;
+        if (priv->revocation_clear_all == _gf_true)
+                args.kind = CLRLK_ALL;
+        else
+                args.kind = CLRLK_GRANTED;
+
+        pthread_mutex_lock (&pinode->mutex);
+        {
+                /* The emptiness test is done under the mutex so the list
+                 * is never inspected while another thread modifies it. */
+                if (list_empty (&dom->inodelk_list))
+                        goto unlock;
+
+                list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) {
+                        if (__stale_inodelk (this, lk, lock,
+                                             &lk_age_sec) == _gf_true) {
+                                revoke_lock = _gf_true;
+                                reason_str = "age";
+                                break;
+                        }
+                }
+
+                /* Count blocked requests down from the configured limit;
+                 * reaching zero means the limit has been hit. */
+                max_blocked = priv->revocation_max_blocked;
+                if (max_blocked != 0 && revoke_lock == _gf_false) {
+                        list_for_each_entry_safe (lk, tmp,
+                                                  &dom->blocked_inodelks,
+                                                  blocked_locks) {
+                                max_blocked--;
+                                if (max_blocked == 0) {
+                                        revoke_lock = _gf_true;
+                                        reason_str = "max blocked";
+                                        break;
+                                }
+                        }
+                }
+        }
+unlock:
+        pthread_mutex_unlock (&pinode->mutex);
+
+        if (revoke_lock == _gf_true) {
+                clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount,
+                                     &op_errno);
+                /* For "max blocked" the age printed is that of the last
+                 * conflicting granted lock examined (0 if none). time_t is
+                 * cast so the argument matches %ld on every platform. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+                        "age: %ld sec] - Inode lock revoked: %d granted & %d "
+                        "blocked locks cleared",
+                        reason_str, uuid_utoa (pinode->gfid), dom->domain,
+                        (long) lk_age_sec, gcount, bcount);
+        }
+        return revoke_lock;
+}
+
/* Determine if lock is grantable or not */
static pl_inode_lock_t *
__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)
@@ -419,8 +519,6 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
client_list) {
- list_del_init (&l->client_list);
-
pl_inodelk_log_cleanup (l);
pl_inode = l->pl_inode;
@@ -458,6 +556,8 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
* forever.
* iv. unref the object.
*/
+ list_del_init (&l->client_list);
+
if (!list_empty (&l->list)) {
__delete_inode_lock (l);
list_add_tail (&l->client_list,
@@ -509,6 +609,7 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
inode_t *inode)
{
+ posix_locks_private_t *priv = NULL;
int ret = -EINVAL;
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
@@ -518,6 +619,8 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
lock->pl_inode = pl_inode;
fl_type = lock->fl_type;
+ priv = this->private;
+
/* Ideally, AFTER a successful lock (both blocking and non-blocking) or
* an unsuccessful blocking lock operation, the inode needs to be ref'd.
*
@@ -537,6 +640,24 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
*/
pl_inode->inode = inode_ref (inode);
+ if (priv->revocation_secs != 0) {
+ if (lock->fl_type != F_UNLCK) {
+ __inodelk_prune_stale (this, pl_inode, dom, lock);
+ } else if (priv->monkey_unlocking == _gf_true) {
+ if (pl_does_monkey_want_stuck_lock ()) {
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ __pl_inodelk_unref (lock);
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
+ inode_unref (pl_inode->inode);
+ gf_log (this->name, GF_LOG_WARNING,
+ "MONKEY LOCKING (forcing stuck lock)!");
+ return 0;
+ }
+ }
+ }
+
if (ctx)
pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index e363f425b65..8eb35da44be 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -190,6 +190,10 @@ typedef struct {
mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */
gf_boolean_t trace; /* trace lock requests in and out */
char *brickname;
+ gf_boolean_t monkey_unlocking;
+ uint32_t revocation_secs;
+ gf_boolean_t revocation_clear_all;
+ uint32_t revocation_max_blocked;
} posix_locks_private_t;
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 3415d59324c..7f85ba4fca5 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -3629,7 +3629,21 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("trace", priv->trace, options, bool, out);
+ GF_OPTION_RECONF ("monkey-unlocking", priv->monkey_unlocking, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("revocation-secs",
+ priv->revocation_secs, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("revocation-clear-all", priv->revocation_clear_all,
+ options, bool, out);
+
+ GF_OPTION_RECONF ("revocation-max-blocked",
+ priv->revocation_max_blocked, options,
+ uint32, out);
ret = 0;
+
out:
return ret;
}
@@ -3680,6 +3694,18 @@ init (xlator_t *this)
GF_OPTION_INIT ("trace", priv->trace, bool, out);
+ GF_OPTION_INIT ("monkey-unlocking", priv->monkey_unlocking,
+ bool, out);
+
+ GF_OPTION_INIT ("revocation-secs", priv->revocation_secs,
+ uint32, out);
+
+ GF_OPTION_INIT ("revocation-clear-all", priv->revocation_clear_all,
+ bool, out);
+
+ GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked,
+ uint32, out);
+
this->local_pool = mem_pool_new (pl_local_t, 32);
if (!this->local_pool) {
ret = -1;
@@ -3936,5 +3962,35 @@ struct volume_options options[] = {
.description = "Trace the different lock requests "
"to logs."
},
+ { .key = { "monkey-unlocking" },
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "Ignore a random number of unlock requests. Useful "
+ "for testing/creating robust lock recovery mechanisms."
+ },
+        /* Lock age threshold, compared against elapsed seconds since the
+         * lock was granted; revocation is skipped when the value is 0. */
+        { .key = {"revocation-secs"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .max = INT_MAX,
+          .default_value = "0",
+          .description = "Maximum time a lock can be taken out, before "
+                         "being revoked.",
+        },
+ { .key = {"revocation-clear-all"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "If set to true, will revoke BOTH granted and blocked "
+ "(pending) lock requests if a revocation threshold is "
+ "hit.",
+ },
+ { .key = {"revocation-max-blocked"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "0",
+ .description = "A number of blocked lock requests after which a lock "
+ "will be revoked to allow the others to proceed. Can "
+ "be used in conjunction w/ revocation-clear-all."
+ },
{ .key = {NULL} },
};
diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c
index fc2ff2ab10d..f5062971bf4 100644
--- a/xlators/features/snapview-server/src/snapview-server-mgmt.c
+++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c
@@ -73,7 +73,7 @@ svs_mgmt_init (xlator_t *this)
if (cmd_args->volfile_server)
host = cmd_args->volfile_server;
- ret = rpc_transport_inet_options_build (&options, host, port);
+ ret = rpc_transport_inet_options_build (&options, host, port, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to build the "
"transport options");
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index bf62290d023..3c21b9755ea 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3418,7 +3418,7 @@ glusterd_transport_keepalive_options_get (int *interval, int *time,
int
glusterd_transport_inet_options_build (dict_t **options, const char *hostname,
- int port)
+ int port, char *addr_family)
{
dict_t *dict = NULL;
int32_t interval = -1;
@@ -3433,7 +3433,8 @@ glusterd_transport_inet_options_build (dict_t **options, const char *hostname,
port = GLUSTERD_DEFAULT_PORT;
/* Build default transport options */
- ret = rpc_transport_inet_options_build (&dict, hostname, port);
+ ret = rpc_transport_inet_options_build (&dict, hostname, port,
+ addr_family);
if (ret)
goto out;
@@ -3470,6 +3471,7 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
int ret = -1;
glusterd_peerctx_t *peerctx = NULL;
data_t *data = NULL;
+ char *addr_family = NULL;
peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t);
if (!peerctx)
@@ -3485,9 +3487,15 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
uniquely identify a
peerinfo */
+ if (dict_get_str(this->options, "transport.address-family",
+ &addr_family)) {
+ addr_family = NULL;
+ }
+
ret = glusterd_transport_inet_options_build (&options,
peerinfo->hostname,
- peerinfo->port);
+ peerinfo->port,
+ addr_family);
if (ret)
goto out;
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
index 0ea66a027bf..4fdff3402f5 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -241,6 +241,50 @@ build_volfile_path (char *volume_id, char *path,
}
+ volid_ptr = strstr (volume_id, "gfproxy-client/");
+ if (volid_ptr) {
+ volid_ptr = strchr (volid_ptr, '/');
+ if (!volid_ptr) {
+ ret = -1;
+ goto out;
+ }
+ volid_ptr++;
+
+ ret = glusterd_volinfo_find (volid_ptr, &volinfo);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Couldn't find volinfo");
+ goto out;
+ }
+
+ glusterd_get_gfproxy_client_volfile (volinfo, path, path_len);
+
+ ret = 0;
+ goto out;
+ }
+
+ volid_ptr = strstr (volume_id, "gfproxy/");
+ if (volid_ptr) {
+ volid_ptr = strchr (volid_ptr, '/');
+ if (!volid_ptr) {
+ ret = -1;
+ goto out;
+ }
+ volid_ptr++;
+
+ ret = glusterd_volinfo_find (volid_ptr, &volinfo);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Couldn't find volinfo");
+ goto out;
+ }
+
+ glusterd_get_gfproxyd_volfile (volinfo, path, path_len);
+
+ ret = 0;
+ goto out;
+ }
+
volid_ptr = strstr (volume_id, "/snaps/");
if (volid_ptr) {
ret = get_snap_volname_and_volinfo (volid_ptr, &volname,
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index e7ae9b7848d..de5fce5a965 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -10796,6 +10796,45 @@ out:
}
void
+glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo,
+                                     char *path, int path_len)
+{
+        /*
+         * Formats into @path (bounded by @path_len) the name of the
+         * trusted gfproxy FUSE client volfile of @volinfo, chosen by the
+         * volume's transport type:
+         *   tcp, tcp+rdma -> <workdir>/trusted-<volname>.tcp-gfproxy-fuse.vol
+         *   rdma          -> <workdir>/trusted-<volname>.rdma-gfproxy-fuse.vol
+         * For any other transport value @path is left unmodified.
+         * workdir is presumably filled in by GLUSTERD_GET_VOLUME_DIR with
+         * the volume's directory - confirm against the macro definition.
+         */
+        char             workdir[PATH_MAX] = {0, };
+        glusterd_conf_t *priv              = THIS->private;
+
+        GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv);
+
+        switch (volinfo->transport_type) {
+        case GF_TRANSPORT_TCP:
+        /* A "tcp,rdma" volume previously fell into default and left
+         * @path unwritten; serve it the tcp volfile instead. */
+        case GF_TRANSPORT_BOTH_TCP_RDMA:
+                snprintf (path, path_len,
+                          "%s/trusted-%s.tcp-gfproxy-fuse.vol",
+                          workdir, volinfo->volname);
+                break;
+
+        case GF_TRANSPORT_RDMA:
+                snprintf (path, path_len,
+                          "%s/trusted-%s.rdma-gfproxy-fuse.vol",
+                          workdir, volinfo->volname);
+                break;
+        default:
+                break;
+        }
+}
+
+/*
+ * Formats into @path (bounded by @path_len) the name of the gfproxy
+ * daemon volfile of @volinfo: "<voldir>/<volname>.gfproxyd.vol".
+ * voldir is presumably filled in by GLUSTERD_GET_VOLUME_DIR with the
+ * volume's directory - confirm against the macro definition.
+ */
+void
+glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo,
+                               char *path, int path_len)
+{
+        glusterd_conf_t *conf             = THIS->private;
+        char             voldir[PATH_MAX] = {0, };
+
+        GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, conf);
+
+        snprintf (path, path_len, "%s/%s.gfproxyd.vol",
+                  voldir, volinfo->volname);
+}
+
+void
glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,
char *path, int path_len)
{
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index f4c4138829f..7445407c010 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -642,6 +642,14 @@ void
glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,
char *path, int path_len);
+void
+glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len);
+
+void
+glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len);
+
int32_t
glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo,
glusterd_brickinfo_t *dup_brickinfo);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 25fb23f72b2..2344fd169f1 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -58,6 +58,20 @@ extern struct volopt_map_entry glusterd_volopt_map[];
} \
} while (0 /* CONSTCOND */)
+/**
+ * Needed for GFProxy
+ */
+#define GF_PROXY_DAEMON_PORT 40000
+#define GF_PROXY_DAEMON_PORT_STR "40000"
+
+static int
+volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+ dict_t *set_dict, void *param);
+
+static int
+build_client_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+ dict_t *mod_dict);
+
/*********************************************
*
* xlator generation / graph manipulation API
@@ -1448,6 +1462,75 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
+/*
+ * Graph builder for the gfproxy daemon volfile.
+ *
+ * Builds the client side of the graph (as a trusted client, with the
+ * "gfproxy-server" flag set in 'set_dict' for the duration of that build),
+ * then stacks a protocol/server xlator on top that listens on
+ * GF_PROXY_DAEMON_PORT_STR and carries the volume's login credentials.
+ *
+ * 'param' is accepted to match the builder signature and is not used here.
+ *
+ * Returns 0 on success and non-zero on any failure.
+ */
static int
+gfproxy_server_graph_builder (volgen_graph_t *graph,
+                              glusterd_volinfo_t *volinfo,
+                              dict_t *set_dict, void *param)
+{
+        xlator_t *xl         = NULL;
+        char      transt[16] = {0, };
+        char      key[1024]  = {0, };
+        char     *username   = NULL;
+        char     *password   = NULL;
+        int       ret        = -1;
+
+        /* We are a trusted client */
+        ret = dict_set_uint32 (set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+        if (ret != 0)
+                goto out;
+
+        ret = dict_set_str (set_dict, "gfproxy-server", "on");
+        if (ret != 0) {
+                /* do not leave the trusted flag behind for later users */
+                dict_del (set_dict, "trusted-client");
+                goto out;
+        }
+
+        /* Build the client section of the graph first */
+        ret = build_client_graph (graph, volinfo, set_dict);
+
+        /* Clear these settings (whether or not the build succeeded) so that
+         * future users of set_dict do not end up thinking they are a
+         * gfproxy server */
+        dict_del (set_dict, "gfproxy-server");
+        dict_del (set_dict, "trusted-client");
+
+        if (ret != 0)
+                goto out;
+
+        /* Then add the server to it */
+        get_vol_transport_type (volinfo, transt);
+        xl = volgen_graph_add (graph, "protocol/server", volinfo->volname);
+        if (!xl) {
+                /* ret is still 0 here; report the failure explicitly */
+                ret = -1;
+                goto out;
+        }
+
+        ret = xlator_set_option (xl, "listen-port", GF_PROXY_DAEMON_PORT_STR);
+        if (ret != 0)
+                goto out;
+
+        ret = xlator_set_option (xl, "transport-type", transt);
+        if (ret != 0)
+                goto out;
+
+        /* Set username and password */
+        username = glusterd_auth_get_username (volinfo);
+        password = glusterd_auth_get_password (volinfo);
+        if (username) {
+                snprintf (key, sizeof (key), "auth.login.%s-server.allow",
+                          volinfo->volname);
+                ret = xlator_set_option (xl, key, username);
+                if (ret != 0) {
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+        /* The password option key embeds the username, so it can only be
+         * formed when a username exists. */
+        if (username && password) {
+                snprintf (key, sizeof (key), "auth.login.%s.password",
+                          username);
+                ret = xlator_set_option (xl, key, password);
+                if (ret != 0)
+                        goto out;
+        }
+
+out:
+        return ret;
+}
+
+static int
brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
{
@@ -2541,6 +2624,48 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
}
+/*
+ * Performance-xlator option handler used for gfproxy server graphs.
+ *
+ * Options whose key contains "write-behind" are skipped; every other
+ * option is handed to perfxl_option_handler() and its result is returned
+ * so that failures are not silently dropped.
+ */
static int
+gfproxy_server_perfxl_option_handler (volgen_graph_t *graph,
+                                      struct volopt_map_entry *vme,
+                                      void *param)
+{
+        GF_ASSERT (param);
+
+        /* write-behind is *not* allowed for gfproxy-servers */
+        if (strstr (vme->key, "write-behind"))
+                return 0;
+
+        return perfxl_option_handler (graph, vme, param);
+}
+
+/*
+ * Performance-xlator option handler used for gfproxy client graphs.
+ *
+ * Only options whose key contains "write-behind" are handed to
+ * perfxl_option_handler(); its result is returned so that failures are
+ * not silently dropped. All other options are skipped.
+ */
+static int
+gfproxy_client_perfxl_option_handler (volgen_graph_t *graph,
+                                      struct volopt_map_entry *vme,
+                                      void *param)
+{
+        GF_ASSERT (param);
+
+        /* write-behind is the only allowed "perf" for gfproxy-clients */
+        if (!strstr (vme->key, "write-behind"))
+                return 0;
+
+        return perfxl_option_handler (graph, vme, param);
+}
+
+
+static int
nfsperfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
void *param)
{
@@ -2768,8 +2893,10 @@ _free_xlator_opt_key (char *key)
}
static xlator_t *
-volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
- char *hostname, char *subvol, char *xl_id,
+volgen_graph_build_client (volgen_graph_t *graph,
+ glusterd_volinfo_t *volinfo,
+ char *hostname, char *port,
+ char *subvol, char *xl_id,
char *transt, dict_t *set_dict)
{
xlator_t *xl = NULL;
@@ -2801,6 +2928,12 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
goto err;
}
+ if (port) {
+ ret = xlator_set_option (xl, "remote-port", port);
+ if (ret)
+ goto err;
+ }
+
ret = xlator_set_option (xl, "remote-subvolume", subvol);
if (ret)
goto err;
@@ -2824,7 +2957,8 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
ret = dict_get_uint32 (set_dict, "trusted-client",
&client_type);
- if (!ret && client_type == GF_CLIENT_TRUSTED) {
+ if (!ret && (client_type == GF_CLIENT_TRUSTED
+ || client_type == GF_CLIENT_TRUSTED_PROXY)) {
str = NULL;
str = glusterd_auth_get_username (volinfo);
if (str) {
@@ -2911,7 +3045,9 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
i = 0;
cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) {
xl = volgen_graph_build_client (graph, volinfo,
- brick->hostname, brick->path,
+ brick->hostname,
+ NULL,
+ brick->path,
brick->brick_id,
transt, set_dict);
if (!xl) {
@@ -3143,8 +3279,9 @@ volgen_graph_build_snapview_client (volgen_graph_t *graph,
get_transport_type (volinfo, set_dict, transt, _gf_false);
- prot_clnt = volgen_graph_build_client (graph, volinfo, NULL, subvol,
- xl_id, transt, set_dict);
+ prot_clnt = volgen_graph_build_client (graph, volinfo,
+ NULL, NULL, subvol,
+ xl_id, transt, set_dict);
if (!prot_clnt) {
ret = -1;
goto out;
@@ -3555,6 +3692,27 @@ static int client_graph_set_perf_options(volgen_graph_t *graph,
{
data_t *tmp_data = NULL;
char *volname = NULL;
+ int ret = 0;
+
+ /*
+ * Logic to make sure gfproxy-client gets custom performance translators
+ */
+ ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0);
+ if (ret == 1) {
+ return volgen_graph_set_options_generic (
+ graph, set_dict, volinfo,
+ &gfproxy_client_perfxl_option_handler);
+ }
+
+ /*
+ * Logic to make sure gfproxy-server gets custom performance translators
+ */
+ ret = dict_get_str_boolean (set_dict, "gfproxy-server", 0);
+ if (ret == 1) {
+ return volgen_graph_set_options_generic (
+ graph, set_dict, volinfo,
+ &gfproxy_server_perfxl_option_handler);
+ }
/*
* Logic to make sure NFS doesn't have performance translators by
@@ -3768,29 +3926,55 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
char *volname = NULL;
glusterd_conf_t *conf = THIS->private;
char *tmp = NULL;
+ char *hostname = NULL;
gf_boolean_t var = _gf_false;
gf_boolean_t ob = _gf_false;
+ gf_boolean_t is_gfproxy = _gf_false;
int uss_enabled = -1;
xlator_t *this = THIS;
+ char *subvol = NULL;
+ size_t subvol_namelen = 0;
GF_ASSERT (this);
GF_ASSERT (conf);
- volname = volinfo->volname;
- ret = volgen_graph_build_clients (graph, volinfo, set_dict,
- param);
- if (ret)
+ ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0);
+ if (ret == -1)
goto out;
- if (volinfo->type == GF_CLUSTER_TYPE_TIER)
- ret = volume_volgen_graph_build_clusters_tier
- (graph, volinfo, _gf_false);
- else
- ret = volume_volgen_graph_build_clusters
- (graph, volinfo, _gf_false);
+ volname = volinfo->volname;
+ if (ret == 0) {
+ ret = volgen_graph_build_clients (graph, volinfo, set_dict,
+ param);
+ if (ret)
+ goto out;
- if (ret == -1)
- goto out;
+ if (volinfo->type == GF_CLUSTER_TYPE_TIER)
+ ret = volume_volgen_graph_build_clusters_tier
+ (graph, volinfo, _gf_false);
+ else
+ ret = volume_volgen_graph_build_clusters
+ (graph, volinfo, _gf_false);
+
+ if (ret == -1)
+ goto out;
+ } else {
+ is_gfproxy = _gf_true;
+ ret = dict_get_str (set_dict,
+ "config.gfproxyd-remote-host", &tmp);
+ if (ret == -1)
+ goto out;
+
+ subvol_namelen = strlen (volinfo->volname) +
+ strlen ("-server") + 1;
+ subvol = alloca (subvol_namelen);
+ snprintf (subvol, subvol_namelen,
+ "%s-server", volinfo->volname);
+
+ volgen_graph_build_client (graph, volinfo, tmp,
+ GF_PROXY_DAEMON_PORT_STR, subvol,
+ "gfproxy", "tcp", set_dict);
+ }
ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false);
if (ret == -1)
@@ -3851,6 +4035,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
+ /* gfproxy needs the AHA translator */
+ if (is_gfproxy) {
+ xl = volgen_graph_add (graph, "cluster/aha", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+
if (conf->op_version == GD_OP_VERSION_MIN) {
ret = glusterd_volinfo_get_boolean (volinfo,
VKEY_FEATURES_QUOTA);
@@ -4731,6 +4924,24 @@ out:
return ret;
}
+/*
+ * Walk the xlators of 'graph' and set the "iam-nfs-daemon" option to "yes"
+ * on each one whose type is "cluster/replicate".
+ *
+ * Stops at the first xlator_set_option() failure and returns its result;
+ * returns 0 when every matching xlator was updated (or none matched).
+ */
+static int
+volgen_graph_set_iam_nfsd (const volgen_graph_t *graph)
+{
+        xlator_t *xl  = first_of ((volgen_graph_t *)graph);
+        int       ret = 0;
+
+        while (xl) {
+                if (strcmp (xl->type, "cluster/replicate") == 0) {
+                        ret = xlator_set_option (xl, "iam-nfs-daemon", "yes");
+                        if (ret)
+                                break;
+                }
+                xl = xl->next;
+        }
+
+        return ret;
+}
+
/* builds a graph for nfs server role, with option overrides in mod_dict */
int
build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
@@ -4869,6 +5080,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
if (ret)
goto out;
+ ret = volgen_graph_set_iam_nfsd (&cgraph);
+ if (ret)
+ goto out;
+
ret = volgen_graph_merge_sub (graph, &cgraph, 1);
if (ret)
goto out;
@@ -4930,6 +5145,22 @@ get_brick_filepath (char *filename, glusterd_volinfo_t *volinfo,
brickinfo->hostname, brick);
}
+/*
+ * Fill 'filename' (a buffer of at least PATH_MAX bytes) with the path of
+ * the gfproxy daemon volfile of 'volinfo'.
+ *
+ * Delegates to glusterd_get_gfproxyd_volfile() so the
+ * "<volume dir>/<volname>.gfproxyd.vol" layout is defined in one place.
+ */
+static void
+get_gfproxyd_filepath (char *filename, glusterd_volinfo_t *volinfo)
+{
+        glusterd_get_gfproxyd_volfile (volinfo, filename, PATH_MAX);
+}
+
gf_boolean_t
glusterd_is_valid_volfpath (char *volname, char *brick)
{
@@ -4975,6 +5206,32 @@ out:
}
+/*
+ * Generate the gfproxy daemon volfile of 'volinfo'.
+ *
+ * Builds the graph with gfproxy_server_graph_builder via
+ * build_graph_generic() and, if that succeeds, writes it to the path
+ * produced by get_gfproxyd_filepath(). The graph is freed in all cases.
+ *
+ * Returns the result of the build, or of the write when the build
+ * succeeded.
+ */
static int
+glusterd_generate_gfproxyd_volfile (glusterd_volinfo_t *volinfo)
+{
+        struct glusterd_gfproxyd_info proxyd_info       = {
+                .port = GF_PROXY_DAEMON_PORT,
+        };
+        volgen_graph_t                vgraph            = {0, };
+        char                          volfile[PATH_MAX] = {0, };
+        int                           ret               = -1;
+
+        GF_ASSERT (volinfo);
+
+        get_gfproxyd_filepath (volfile, volinfo);
+
+        ret = build_graph_generic (&vgraph, volinfo,
+                                   NULL, &proxyd_info,
+                                   &gfproxy_server_graph_builder);
+        if (!ret)
+                ret = volgen_write_volfile (&vgraph, volfile);
+
+        volgen_graph_free (&vgraph);
+
+        return ret;
+}
+
+static int
glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
dict_t *mod_dict, void *data)
@@ -5245,7 +5502,8 @@ glusterd_generate_client_per_brick_volfile (glusterd_volinfo_t *volinfo)
cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) {
xl = volgen_graph_build_client (&graph, volinfo,
- brick->hostname, brick->path,
+ brick->hostname,
+ NULL, brick->path,
brick->brick_id,
"tcp", dict);
if (!xl) {
@@ -5376,6 +5634,11 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo,
ret = glusterd_get_trusted_client_filepath (filepath,
volinfo,
type);
+ } else if (client_type == GF_CLIENT_TRUSTED_PROXY) {
+ glusterd_get_gfproxy_client_volfile (volinfo,
+ filepath,
+ PATH_MAX);
+ ret = dict_set_str (dict, "gfproxy-client", "on");
} else {
ret = glusterd_get_client_filepath (filepath,
volinfo,
@@ -5620,6 +5883,7 @@ build_bitd_volume_graph (volgen_graph_t *graph,
xl = volgen_graph_build_client (&cgraph, volinfo,
brickinfo->hostname,
+ NULL,
brickinfo->path,
brickinfo->brick_id,
transt, set_dict);
@@ -5782,6 +6046,7 @@ build_scrub_volume_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
xl = volgen_graph_build_client (&cgraph, volinfo,
brickinfo->hostname,
+ NULL,
brickinfo->path,
brickinfo->brick_id,
transt, set_dict);
@@ -5913,12 +6178,25 @@ glusterd_create_volfiles (glusterd_volinfo_t *volinfo)
goto out;
}
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED_PROXY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not generate gfproxy client volfiles");
+ goto out;
+ }
+
ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER);
if (ret)
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_VOLFILE_CREATE_FAIL,
"Could not generate client volfiles");
+
+ ret = glusterd_generate_gfproxyd_volfile (volinfo);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not generate gfproxy volfiles");
+
out:
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index f90177372dc..cb2cad50efc 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -52,7 +52,8 @@
typedef enum {
GF_CLIENT_TRUSTED,
- GF_CLIENT_OTHER
+ GF_CLIENT_OTHER,
+ GF_CLIENT_TRUSTED_PROXY,
} glusterd_client_type_t;
struct volgen_graph {
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index bade4ffb06d..61c79655ccf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -286,6 +286,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)
int32_t type = 0;
char *username = NULL;
char *password = NULL;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+#else
+ char *addr_family = "inet";
+#endif
GF_ASSERT (req);
@@ -388,10 +393,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)
/* Setting default as inet for trans_type tcp */
ret = dict_set_dynstr_with_alloc (dict,
"transport.address-family",
- "inet");
+ addr_family);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "failed to set transport.address-family");
+ "failed to set transport.address-family "
+ "to %s", addr_family);
goto out;
}
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 1e24adabe0c..bcb8877c5bd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1048,6 +1048,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 1,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.min-free-strict-mode",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "cluster.min-free-inodes",
.voltype = "cluster/distribute",
.op_version = 1,
@@ -1113,6 +1118,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.flags = OPT_FLAG_CLIENT_OPT,
},
+ { .key = "cluster.du-refresh-interval-sec",
+ .voltype = "cluster/distribute",
+ .option = "du-refresh-interval-sec",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
/* NUFA xlator options (Distribute special case) */
{ .key = "cluster.nufa",
.voltype = "cluster/distribute",
@@ -1461,6 +1473,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "thread-count",
.op_version = 1
},
+ { .key = "performance.io-thread-fops-per-thread-ratio",
+ .voltype = "performance/io-threads",
+ .option = "fops-per-thread-ratio",
+ .op_version = 1
+ },
{ .key = "performance.high-prio-threads",
.voltype = "performance/io-threads",
.op_version = 1
@@ -1555,6 +1572,18 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 2,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "performance.write-behind-trickling-writes",
+ .voltype = "performance/write-behind",
+ .option = "trickling-writes",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.nfs.write-behind-trickling-writes",
+ .voltype = "performance/write-behind",
+ .option = "trickling-writes",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "performance.lazy-open",
.voltype = "performance/open-behind",
.option = "lazy-open",
@@ -2500,6 +2529,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.voltype = "storage/posix",
.op_version = GD_OP_VERSION_3_6_0,
},
+ { .key = "storage.min-free-disk",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
+ { .key = "storage.freespace-check-interval",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
{ .key = "storage.bd-aio",
.voltype = "storage/bd",
.op_version = 3
@@ -2515,6 +2552,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "!config",
.op_version = 2
},
+ { .key = "config.gfproxyd-remote-host",
+ .voltype = "configuration",
+ .option = "gfproxyd-remote-host",
+ .op_version = 2
+ },
{ .key = GLUSTERD_QUORUM_TYPE_KEY,
.voltype = "mgmt/glusterd",
.value = "off",
@@ -2961,7 +3003,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {
{ .key = "cluster.locking-scheme",
.voltype = "cluster/replicate",
.type = DOC,
- .op_version = GD_OP_VERSION_3_7_12,
+ .op_version = GD_OP_VERSION_3_7_12 ,
.flags = OPT_FLAG_CLIENT_OPT
},
{ .key = "cluster.granular-entry-heal",
@@ -2970,6 +3012,72 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_8_0,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .option = "revocation-secs",
+ .key = "features.locks-revocation-secs",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "revocation-clear-all",
+ .key = "features.locks-revocation-clear-all",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "revocation-max-blocked",
+ .key = "features.locks-revocation-max-blocked",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "monkey-unlocking",
+ .key = "features.locks-monkey-unlocking",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ .type = NO_DOC,
+ },
+ { .key = "cluster.halo-enabled",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-hybrid-mode",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-failover-enabled",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-shd-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-nfsd-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-max-replicas",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-min-replicas",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-min-samples",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index bb6af7f378f..4795f958038 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -222,6 +222,11 @@ struct glusterd_brickinfo {
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
+/*
+ * Parameters handed to the gfproxy daemon graph builder.
+ *
+ * 'port' is an int rather than a short: the daemon port (40000) does not
+ * fit in a 16-bit signed short, so a short would store a converted,
+ * negative value.
+ */
+struct glusterd_gfproxyd_info {
+        int   port;     /* listen port of the gfproxy daemon */
+        char *logfile;  /* NOTE(review): presumably the daemon log path; not set in the visible code */
+};
+
struct gf_defrag_brickinfo_ {
char *name;
int files;
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 6c4cdfed062..598f62fee7a 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -186,6 +186,25 @@ start_glusterfs ()
fi
#options with values start here
+ if [ -n "$halo_failover_enabled" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-failover-enabled=$halo_failover_enabled");
+ fi
+ if [ -n "$halo_max_latency" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-latency=$halo_max_latency");
+ fi
+
+ if [ -n "$halo_max_replicas" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-replicas=$halo_max_replicas");
+ fi
+
+ if [ -n "$halo_min_replicas" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-min-replicas=$halo_min_replicas");
+ fi
+
if [ -n "$log_level" ]; then
cmd_line=$(echo "$cmd_line --log-level=$log_level");
fi
@@ -479,6 +498,18 @@ with_options()
[ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
fuse_mountopts="${fuse_mountopts}$key=\"$value\""
;;
+ "halo-max-latency")
+ halo_max_latency=$value
+ ;;
+ "halo-max-replicas")
+ halo_max_replicas=$value
+ ;;
+ "halo-min-replicas")
+ halo_min_replicas=$value
+ ;;
+ "halo-failover-enabled")
+ halo_failover_enabled=$value
+ ;;
x-*)
# comments or userspace application-specific options, drop them
;;
diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h
index bc9af2f0b8b..0079b9a3deb 100644
--- a/xlators/nfs/server/src/exports.h
+++ b/xlators/nfs/server/src/exports.h
@@ -22,7 +22,7 @@
#define GF_EXP GF_NFS"-exports"
#define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())"
-#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)"
+#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)"
#define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)"
#define NETGROUP_MAX_LEN 128
diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
index 48b719d29aa..bff7e0669ff 100644
--- a/xlators/nfs/server/src/mount3.c
+++ b/xlators/nfs/server/src/mount3.c
@@ -1896,7 +1896,7 @@ _mnt3_get_host_from_peer (const char *peer_addr)
size_t host_len = 0;
char *colon = NULL;
- colon = strchr (peer_addr, ':');
+ colon = strrchr (peer_addr, ':');
if (!colon) {
gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER,
"Bad peer %s", peer_addr);
@@ -4123,6 +4123,15 @@ mnt1svc_init (xlator_t *nfsx)
}
}
+#ifdef IPV6_DEFAULT
+ ret = dict_set_str (options, "transport.address-family", "inet6");
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "dict_set_str error when trying to enable ipv6");
+ goto err;
+ }
+#endif
+
ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_msg (GF_NFS, GF_LOG_ERROR, errno,
diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c
index e8e226e953e..536a45ede3d 100644
--- a/xlators/nfs/server/src/mount3udp_svc.c
+++ b/xlators/nfs/server/src/mount3udp_svc.c
@@ -133,7 +133,15 @@ mountudp_program_3(struct svc_req *rqstp, register SVCXPRT *transp)
mountres3 *res = NULL;
struct sockaddr_in *sin = NULL;
- sin = svc_getcaller (transp);
+ sin = (struct sockaddr_in *)svc_getcaller (transp);
+ /* svc_getcaller returns a pointer to a sockaddr_in6, even though it
+ * might actually be an IPv4 address. It ought to return a struct sockaddr
+ * and make the caller upcast it to the proper address family. Sigh.
+ *
+ * Let's make sure that it's actually an IPv4 address.
+ */
+ GF_ASSERT (sin->sin_family == AF_INET);
+
inet_ntop (AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN+1);
switch (rqstp->rq_proc) {
diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c
index af37f6b264c..a39a0e6ee3a 100644
--- a/xlators/nfs/server/src/nfs-common.c
+++ b/xlators/nfs/server/src/nfs-common.c
@@ -138,8 +138,12 @@ nfs_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)
gf_uuid_copy (loc->gfid, inode->gfid);
}
- if (parent)
+ if (parent) {
loc->parent = inode_ref (parent);
+ if (!gf_uuid_is_null (parent->gfid)) {
+ gf_uuid_copy (loc->pargfid, parent->gfid);
+ }
+ }
if (path) {
loc->path = gf_strdup (path);
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index ddfa89dab11..d5087f195ca 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -204,6 +204,9 @@ nfs_program_register_portmap_all (struct nfs_state *nfs)
if (nfs->override_portnum)
prog->progport = nfs->override_portnum;
(void) rpcsvc_program_register_portmap (prog, prog->progport);
+#ifdef IPV6_DEFAULT
+ (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport);
+#endif
}
return (0);
@@ -339,6 +342,17 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
if (version->required)
goto err;
}
+#ifdef IPV6_DEFAULT
+ ret = rpcsvc_program_register_rpcbind6 (prog,
+ prog->progport);
+ if (ret == -1) {
+ gf_msg (GF_NFS, GF_LOG_ERROR, 0,
+ NFS_MSG_PGM_REG_FAIL,
+ "Program (ipv6) %s registration failed",
+ prog->progname);
+ goto err;
+ }
+#endif
}
}
@@ -901,6 +915,16 @@ nfs_init_state (xlator_t *this)
}
}
+#ifdef IPV6_DEFAULT
+ ret = dict_set_str (this->options, "transport.address-family",
+ "inet6");
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error");
+ goto free_foppool;
+ }
+#endif
+
+
/* Right now only socket support exists between nfs client and
* gluster nfs, so we can set default value as socket
*/
@@ -2019,7 +2043,7 @@ struct volume_options options[] = {
},
{ .key = {"nfs.mount-rmtab"},
.type = GF_OPTION_TYPE_PATH,
- .default_value = NFS_DATADIR "/rmtab",
+ .default_value = "/-",
.description = "Set the location of the cache file that is used to "
"list all the NFS-clients that have connected "
"through the MOUNT protocol. If this is on shared "
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
index 64287c5b1bd..5aa9ea4e76e 100644
--- a/xlators/nfs/server/src/nfs3.c
+++ b/xlators/nfs/server/src/nfs3.c
@@ -372,6 +372,28 @@ out:
} while (0) \
+/*
+ * This macro checks if the volume is started or not.
+ * If it is not started, it closes the client connection & logs it.
+ *
+ * Why do we do this?
+ *
+ * There is a "race condition" where gNFSd may start listening for RPC requests
+ * prior to the volume being started. Presumably, that is why this macro exists
+ * in the first place. In the NFS kernel client (specifically Linux's NFS
+ * kernel client), they establish a TCP connection to our endpoint and
+ * (re-)send requests. If we ignore the request, and return nothing back,
+ * the NFS kernel client waits forever for our response. If for some reason,
+ * the TCP connection were to die, and re-establish, the requests are
+ * retransmitted and everything begins working as expected
+ *
+ * Now, this is clearly bad behavior on the client side,
+ * but in order to make every user's life easier,
+ * gNFSd should simply disconnect the TCP connection if it sees requests
+ * before it is ready to accept them.
+ *
+ */
+
#define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl) \
do { \
if ((!nfs_subvolume_started (nfs_state (nf3stt->nfsx), vlm))){\
@@ -379,11 +401,32 @@ out:
NFS_MSG_VOL_DISABLE, \
"Volume is disabled: %s", \
vlm->name); \
+ nfs3_disconnect_transport (req->trans); \
rtval = RPCSVC_ACTOR_IGNORE; \
goto erlbl; \
} \
} while (0) \
+/*
+ * Disconnect 'transport' via rpc_transport_disconnect() and log the
+ * outcome (at WARNING level in both cases) together with the peer
+ * identifier. Does nothing beyond the validation macro when 'transport'
+ * is NULL.
+ */
+void
+nfs3_disconnect_transport (rpc_transport_t *transport)
+{
+        GF_VALIDATE_OR_GOTO (GF_NFS3, transport, out);
+
+        if (rpc_transport_disconnect (transport) == 0) {
+                gf_log (GF_NFS3, GF_LOG_WARNING,
+                        "Closed client connection to %s.",
+                        transport->peerinfo.identifier);
+        } else {
+                gf_log (GF_NFS3, GF_LOG_WARNING,
+                        "Unable to close client connection to %s.",
+                        transport->peerinfo.identifier);
+        }
+out:
+        return;
+}
int
nfs3_export_sync_trusted (struct nfs3_state *nfs3, uuid_t exportid)
@@ -778,6 +821,12 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cs = frame->local;
if (op_ret == -1) {
+ /* Prevent crashes for the case where this call fails
+ * and buf is left in a NULL state, yet the op_errno == 0.
+ */
+ if (!buf && op_errno == 0) {
+ op_errno = EIO;
+ }
status = nfs3_cbk_errno_status (op_ret, op_errno);
}
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index c81a97d8a39..5ab38890df3 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -162,8 +162,6 @@ iot_worker (void *data)
THIS = this;
for (;;) {
- sleep_till.tv_sec = time (NULL) + conf->idle_time;
-
pthread_mutex_lock (&conf->mutex);
{
if (pri != -1) {
@@ -171,8 +169,11 @@ iot_worker (void *data)
pri = -1;
}
while (conf->queue_size == 0) {
- conf->sleep_count++;
+ clock_gettime (CLOCK_REALTIME_COARSE,
+ &sleep_till);
+ sleep_till.tv_sec += conf->idle_time;
+ conf->sleep_count++;
ret = pthread_cond_timedwait (&conf->cond,
&conf->mutex,
&sleep_till);
@@ -202,7 +203,7 @@ iot_worker (void *data)
&conf->mutex, &sleep);
pthread_mutex_unlock(&conf->mutex);
continue;
- }
+ }
}
pthread_mutex_unlock (&conf->mutex);
@@ -228,14 +229,25 @@ int
do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri)
{
int ret = 0;
+ int active_count = 0;
pthread_mutex_lock (&conf->mutex);
{
__iot_enqueue (conf, stub, pri);
- pthread_cond_signal (&conf->cond);
-
- ret = __iot_workers_scale (conf);
+ /* If we have an ample supply of threads alive already
+ * it's massively more efficient to keep the ones you have
+ * busy vs making new ones and signaling everyone
+ */
+ active_count = conf->curr_count - conf->sleep_count;
+ if (conf->fops_per_thread_ratio == 0 || active_count == 0 ||
+ (conf->queue_size/active_count >
+ conf->fops_per_thread_ratio &&
+ active_count < conf->max_count)) {
+ pthread_cond_signal (&conf->cond);
+
+ ret = __iot_workers_scale (conf);
+ }
}
pthread_mutex_unlock (&conf->mutex);
@@ -900,6 +912,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out);
+ GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+ options, int32, out);
+
GF_OPTION_RECONF ("high-prio-threads",
conf->ac_iot_limit[IOT_PRI_HI], options, int32, out);
@@ -972,6 +987,9 @@ init (xlator_t *this)
GF_OPTION_INIT ("thread-count", conf->max_count, int32, out);
+ GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+ int32, out);
+
GF_OPTION_INIT ("high-prio-threads",
conf->ac_iot_limit[IOT_PRI_HI], int32, out);
@@ -1096,6 +1114,20 @@ struct volume_options options[] = {
"perform concurrent IO operations"
},
+ { .key = {"fops-per-thread-ratio"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = IOT_MIN_FOP_PER_THREAD,
+ .max = IOT_MAX_FOP_PER_THREAD,
+ .default_value = "20",
+ .description = "The optimal ratio of threads to FOPs in the queue "
+ "we wish to achieve before creating a new thread. "
+ "The idea here is it's far cheaper to keep our "
+ "currently running threads busy than spin up "
+ "new threads or cause a stampeding herd of threads "
+ "to service a singlular FOP when you have a thread "
+ "which will momentarily become available to do the "
+ "work."
+ },
{ .key = {"high-prio-threads"},
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index d8eea2cf77a..e5c97f690a2 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -34,7 +34,9 @@ struct iot_conf;
#define IOT_MIN_THREADS 1
#define IOT_DEFAULT_THREADS 16
-#define IOT_MAX_THREADS 64
+#define IOT_MAX_THREADS 256
+#define IOT_MIN_FOP_PER_THREAD 0
+#define IOT_MAX_FOP_PER_THREAD 2000
#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024))
@@ -62,6 +64,7 @@ struct iot_conf {
pthread_cond_t cond;
int32_t max_count; /* configured maximum */
+ int32_t fops_per_thread_ratio;
int32_t curr_count; /* actual number of threads running */
int32_t sleep_count;
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index 30443761c56..c3baafdc1b6 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -33,6 +33,7 @@ struct mdc_conf {
gf_boolean_t cache_selinux;
gf_boolean_t force_readdirp;
gf_boolean_t cache_swift_metadata;
+ gf_boolean_t cache_all_xattrs;
};
@@ -792,6 +793,7 @@ struct checkpair {
static int
is_mdc_key_satisfied (const char *key)
{
+ unsigned int checked_keys = 0;
const char *mdc_key = NULL;
int i = 0;
@@ -801,11 +803,13 @@ is_mdc_key_satisfied (const char *key)
for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {
if (!mdc_keys[i].load)
continue;
+
+ checked_keys++;
if (strcmp (mdc_key, key) == 0)
return 1;
}
- return 0;
+ return 0;
}
@@ -875,7 +879,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xattr_rsp = NULL;
dict_t *xattr_alloc = NULL;
mdc_local_t *local = NULL;
-
+ struct mdc_conf *conf = this->private;
local = mdc_local_get (frame);
if (!local)
@@ -899,10 +903,17 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
if (ret != 0)
goto uncached;
- if (!mdc_xattr_satisfied (this, xdata, xattr_rsp))
+ /* Only check the keys if we are not caching all the xattrs */
+ if (!conf->cache_all_xattrs &&
+ !mdc_xattr_satisfied (this, xdata, xattr_rsp)) {
goto uncached;
+ }
}
+ gf_msg (this->name, GF_LOG_TRACE, 0, 0,
+ "Returning lookup from cache for gfid %s",
+ uuid_utoa(loc->inode->gfid));
+
MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf,
xattr_rsp, &postparent);
@@ -1882,6 +1893,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
int op_errno = ENODATA;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
+ struct mdc_conf *conf = this->private;
local = mdc_local_get (frame);
if (!local)
@@ -1897,7 +1909,18 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
goto uncached;
if (!xattr || !dict_get (xattr, (char *)key)) {
- ret = -1;
+ /* If we can't find the extended attribute, & cache-all-xattrs
+ * is enabled, we should wind and try to find them.
+ *
+ * NOTE: Quota & AFR queries through the mount
+ * (i.e, virtual Gluster xattrs)
+ * won't work unless we do this.
+ */
+ if (conf->cache_all_xattrs) {
+ goto uncached;
+ }
+
+ ret = -1;
op_errno = ENODATA;
}
@@ -2363,7 +2386,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out);
-
+ GF_OPTION_RECONF("cache-all-xattrs", conf->cache_all_xattrs, options,
+ bool, out);
out:
return 0;
}
@@ -2404,6 +2428,7 @@ init (xlator_t *this)
conf->cache_swift_metadata);
GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out);
+ GF_OPTION_INIT ("cache-all-xattrs", conf->cache_all_xattrs, bool, out);
out:
this->private = conf;
@@ -2474,7 +2499,7 @@ struct volume_options options[] = {
{ .key = {"md-cache-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .max = 60,
+ .max = 300,
.default_value = "1",
.description = "Time period after which cache has to be refreshed",
},
@@ -2484,5 +2509,19 @@ struct volume_options options[] = {
.description = "Convert all readdir requests to readdirplus to "
"collect stat info on each entry.",
},
+ { .key = {"strict-xattrs"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .description = "When reading extended attributes from the cache, "
+ "if an xattr is not found, attempt to find it by winding "
+ "instead of returning ENODATA. This is necessary to query "
+ "the special extended attributes (trusted.glusterfs.quota.size) "
+ "through a FUSE mount with md-cache enabled."
+ },
+ { .key = {"cache-all-xattrs"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Cache all the extended attributes for an inode.",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 7f5719b1e48..bc59036ff88 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -169,6 +169,7 @@ typedef struct wb_request {
typedef struct wb_conf {
uint64_t aggregate_size;
+ uint64_t page_size;
uint64_t window_size;
gf_boolean_t flush_behind;
gf_boolean_t trickling_writes;
@@ -1207,18 +1208,21 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
char *ptr = NULL;
struct iobuf *iobuf = NULL;
struct iobref *iobref = NULL;
+ struct wb_conf *conf = NULL;
int ret = -1;
ssize_t required_size = 0;
size_t holder_len = 0;
size_t req_len = 0;
+ conf = req->wb_inode->this->private;
+
if (!holder->iobref) {
holder_len = iov_length (holder->stub->args.vector,
holder->stub->args.count);
req_len = iov_length (req->stub->args.vector,
req->stub->args.count);
- required_size = max ((THIS->ctx->page_size),
+ required_size = max ((conf->page_size),
(holder_len + req_len));
iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool,
required_size);
@@ -1281,7 +1285,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
wb_request_t *holder = NULL;
wb_conf_t *conf = NULL;
int ret = 0;
- ssize_t page_size = 0;
/* With asynchronous IO from a VM guest (as a file), there
can be two sequential writes happening in two regions
@@ -1292,7 +1295,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
through the interleaved ops
*/
- page_size = wb_inode->this->ctx->page_size;
conf = wb_inode->this->private;
list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) {
@@ -1343,7 +1345,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
- space_left = page_size - holder->write_size;
+ space_left = wb_inode->window_conf - holder->write_size;
if (space_left < req->write_size) {
holder->ordering.go = 1;
@@ -2471,6 +2473,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64,
out);
+ GF_OPTION_RECONF ("cache-size", conf->page_size, options, size_uint64,
+ out);
+
GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,
out);
@@ -2522,6 +2527,7 @@ init (xlator_t *this)
/* configure 'option window-size <size>' */
GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out);
+ GF_OPTION_INIT ("cache-size", conf->page_size, size_uint64, out);
if (!conf->window_size && conf->aggregate_size) {
gf_msg (this->name, GF_LOG_WARNING, 0,
diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
index dc6e244e717..7732a9711ae 100644
--- a/xlators/protocol/client/src/client-handshake.c
+++ b/xlators/protocol/client/src/client-handshake.c
@@ -15,6 +15,7 @@
#include "glusterfs.h"
#include "statedump.h"
#include "compat-errno.h"
+#include "latency.h"
#include "glusterfs3.h"
#include "portmap-xdr.h"
@@ -1549,7 +1550,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi
rpc_clnt_reconfig (conf->rpc, &config);
conf->skip_notify = 1;
- conf->quick_reconnect = 1;
+ conf->quick_reconnect = 1;
out:
if (frame)
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 3cb5e231fbe..3e18b4870ae 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -467,7 +467,7 @@ int32_t
client_forget (xlator_t *this, inode_t *inode)
{
/* Nothing here */
- return 0;
+ return 0;
}
int32_t
@@ -545,7 +545,7 @@ out:
STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -571,7 +571,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -600,7 +600,7 @@ out:
STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -628,7 +628,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -657,7 +657,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -687,7 +687,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -718,7 +718,7 @@ out:
STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -748,7 +748,7 @@ out:
STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -778,7 +778,7 @@ out:
STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN,
NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -807,7 +807,7 @@ out:
STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN,
NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -837,7 +837,7 @@ out:
STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -867,7 +867,7 @@ out:
STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -897,7 +897,7 @@ out:
STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -932,7 +932,7 @@ out:
STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -965,7 +965,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1000,7 +1000,7 @@ out:
STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN,
NULL, 0, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1038,7 +1038,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1064,7 +1064,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1093,7 +1093,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1120,7 +1120,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1149,7 +1149,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1177,7 +1177,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1204,7 +1204,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
static gf_boolean_t
@@ -1393,7 +1393,7 @@ out:
if (need_unwind)
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL);
- return 0;
+ return 0;
}
@@ -1423,7 +1423,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1453,7 +1453,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1482,7 +1482,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1512,7 +1512,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1542,7 +1542,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1571,7 +1571,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1598,7 +1598,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1654,7 +1654,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1684,7 +1684,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1715,7 +1715,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (finodelk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1747,7 +1747,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1780,7 +1780,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1809,7 +1809,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1840,7 +1840,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1872,7 +1872,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1901,7 +1901,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1929,7 +1929,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -2155,7 +2155,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL);
- return 0;
+ return 0;
}
@@ -2227,6 +2227,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf = this->private;
switch (event) {
+ case RPC_CLNT_PING:
+ {
+ ret = default_notify (this, GF_EVENT_CHILD_PING, NULL);
+ if (ret)
+ gf_log (this->name, GF_LOG_INFO,
+ "CHILD_PING notify failed");
+ conf->last_sent_event = GF_EVENT_CHILD_PING;
+ break;
+ }
case RPC_CLNT_CONNECT:
{
conf->connected = 1;
@@ -2312,13 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf->connected = 0;
conf->skip_notify = 0;
- if (conf->quick_reconnect) {
- conf->quick_reconnect = 0;
- rpc_clnt_start (rpc);
-
- } else {
+ if (conf->rpc->conn.connected) {
+ /* Having conf->connected false and
+ * conf->rpc->conn.connected true is an
+ * unrecoverable state, since rpc_clnt_reconnect
+ * will do nothing for an already connected connection.
+ * A good fix would be to ensure serialized
+ * delivery of transport messages, but that is super hard
+ * and this is rare. So... ghetto "fix", disconnect the
+ * RPC and start the race again. Maybe we'll win
+ * next time!
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "Client %s reconnect race detected, "
+ "restarting.", conf->rpc->conn.name);
+ conf->quick_reconnect = 1;
+ rpc_transport_disconnect (rpc->conn.trans);
rpc->conn.config.remote_port = 0;
-
+ } else {
+ if (conf->quick_reconnect) {
+ conf->quick_reconnect = 0;
+ rpc_clnt_start (rpc);
+ } else {
+ rpc->conn.config.remote_port = 0;
+ }
}
break;
@@ -2670,7 +2696,7 @@ reconfigure (xlator_t *this, dict_t *options)
ret = 0;
out:
- return ret;
+ return ret;
}
@@ -2724,6 +2750,8 @@ init (xlator_t *this)
this->private = conf;
+ this->client_latency.min = UINT64_MAX;
+
/* If it returns -1, then its a failure, if it returns +1 we need
have to understand that 'this' is subvolume of a xlator which,
will set the remote host and remote subvolume in a setxattr
@@ -3001,7 +3029,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_TIME,
.min = 0,
.max = 1013,
- .default_value = "42",
+ .default_value = "180",
.description = "Time duration for which the client waits to "
"check if the server is responsive."
},
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index d8ef5f7b73f..636108affbb 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ goto err;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL,
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index cecf5dcb66d..c40a087ec46 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -658,6 +658,81 @@ out:
return 0;
}
+/*
+ * freespace_ok - decide whether the filesystem described by @stats still has
+ * enough free space to accept writes.
+ *
+ * @this:          xlator; only its name is used, as the log domain.
+ * @stats:         statvfs() result for the backing filesystem.
+ * @min_free_disk: threshold. Values below 100.0 are compared against the
+ *                 percentage of free blocks, anything else against the
+ *                 number of free bytes.
+ * @previously_ok: verdict of the previous check; used only so that a
+ *                 transition is logged once rather than on every call.
+ *
+ * Returns _gf_true when free space is at or above the threshold.
+ */
+static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats,
+                                  double min_free_disk,
+                                  gf_boolean_t previously_ok)
+{
+        gf_boolean_t currently_ok;
+
+        if (min_free_disk < 100.0) {
+                /* A filesystem reporting zero total blocks has no free
+                 * space. Computing 0/0 would give NaN, which compares
+                 * false against every threshold (including 0) and shows
+                 * up as "nan%" in the log. */
+                double free_percent = 0.0;
+
+                if (stats->f_blocks != 0) {
+                        free_percent = 100.0 * stats->f_bavail /
+                                       stats->f_blocks;
+                }
+
+                currently_ok =
+                        free_percent >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free percent "
+                                "%f%% < %f%%. Writes disabled.",
+                                free_percent, min_free_disk);
+                }
+        } else {
+                /* Convert before multiplying: the product of two integer
+                 * fields can wrap on platforms where they are 32 bits
+                 * wide. */
+                double free_bytes = (double) stats->f_bavail *
+                                    (double) stats->f_frsize;
+
+                currently_ok =
+                        free_bytes >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free bytes %f "
+                                "< %f. Writes disabled.",
+                                free_bytes, min_free_disk);
+                }
+        }
+
+        if (currently_ok && !previously_ok) {
+                gf_log (this->name, GF_LOG_INFO, "Free space has risen above "
+                                                 "min-free-disk limit, writes "
+                                                 "re-enabled.");
+        }
+
+        return currently_ok;
+}
+
+/*
+ * posix_write_ok - report whether there is sufficient free space to allow
+ * writes.
+ *
+ * @this: xlator, passed through to freespace_ok() and used for logging.
+ * @priv: posix private state holding the free-space bookkeeping.
+ *
+ * This is called in the write path, so performance matters. Free space is
+ * sampled with statvfs() at most once per freespace_check_interval seconds.
+ * freespace_check_lock ensures only one thread at a time refreshes the
+ * sample; if the lock is contended, the previous status (reflected in
+ * freespace_check_passed) is returned while the thread holding the mutex
+ * updates it. An interval of 0 disables the check entirely.
+ *
+ * Returns _gf_true when writes may proceed.
+ */
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv)
+{
+        if (!priv->freespace_check_interval) {
+                return _gf_true;
+        }
+
+        if (!pthread_mutex_trylock (&priv->freespace_check_lock)) {
+                struct timespec now;
+
+                clock_gettime (CLOCK_MONOTONIC, &now);
+                if (now.tv_sec >= priv->freespace_check_last.tv_sec +
+                                  priv->freespace_check_interval) {
+                        struct statvfs fresh;
+
+                        /* Stamp the attempt even when it fails, so a
+                         * failing statvfs() is retried once per interval
+                         * instead of on every write. */
+                        priv->freespace_check_last.tv_sec = now.tv_sec;
+
+                        /* Assumes sys_statvfs() follows statvfs() and
+                         * returns 0 on success. Only a successful sample
+                         * is committed; after a failure the buffer content
+                         * is unspecified, so the previous verdict stands. */
+                        if (sys_statvfs (priv->base_path, &fresh) == 0) {
+                                priv->freespace_stats = fresh;
+                                priv->freespace_check_passed = freespace_ok (
+                                        this, &priv->freespace_stats,
+                                        priv->min_free_disk,
+                                        priv->freespace_check_passed);
+                        } else {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "statvfs on %s failed, keeping "
+                                        "previous free-space status.",
+                                        priv->base_path);
+                        }
+                }
+
+                pthread_mutex_unlock (&priv->freespace_check_lock);
+        }
+
+        /* Deliberately read without the lock: a slightly stale verdict is
+         * acceptable on the contended path. */
+        return priv->freespace_check_passed;
+}
+
static int32_t
posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t flags, off_t offset, size_t len,
@@ -667,6 +742,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_errno = 0;
struct posix_fd *pfd = NULL;
gf_boolean_t locked = _gf_false;
+ struct posix_private *priv = this->private;
DECLARE_OLD_FS_ID_VAR;
@@ -675,6 +751,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (priv, out);
+
+ if (!posix_write_ok (this, priv)) {
+ ret = -ENOSPC;
+ goto out;
+ }
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
@@ -3307,6 +3389,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (priv, out);
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ op_ret = -1;
+ goto out;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL,
@@ -6671,6 +6759,16 @@ struct posix_private *priv = NULL;
options, uint32, out);
posix_spawn_health_check_thread (this);
+        {
+                /* Parse into locals before taking the lock.
+                 * GF_OPTION_RECONF is handed 'out' as its failure label;
+                 * leaving through it while freespace_check_lock is held
+                 * would return with the mutex still locked, after which
+                 * the trylock in posix_write_ok() could never succeed
+                 * again. */
+                uint32_t new_check_interval = priv->freespace_check_interval;
+                double   new_min_free_disk  = priv->min_free_disk;
+
+                GF_OPTION_RECONF ("freespace-check-interval",
+                                  new_check_interval,
+                                  options, uint32, out);
+                GF_OPTION_RECONF ("min-free-disk", new_min_free_disk, options,
+                                  percent_or_size, out);
+
+                /* Publish both values together, and only when both parsed. */
+                pthread_mutex_lock (&priv->freespace_check_lock);
+                {
+                        priv->freespace_check_interval = new_check_interval;
+                        priv->min_free_disk = new_min_free_disk;
+                }
+                pthread_mutex_unlock (&priv->freespace_check_lock);
+        }
+
ret = 0;
out:
return ret;
@@ -7285,6 +7383,19 @@ init (xlator_t *this)
GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
uint32, out);
+
+ GF_OPTION_INIT ("freespace-check-interval",
+ _private->freespace_check_interval, uint32, out);
+
+ GF_OPTION_INIT ("min-free-disk", _private->min_free_disk,
+ percent_or_size, out);
+
+ pthread_mutex_init (&_private->freespace_check_lock, NULL);
+ sys_statvfs (_private->base_path, &_private->freespace_stats);
+ clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last);
+ _private->freespace_check_passed = freespace_ok (
+ this, &_private->freespace_stats, _private->min_free_disk,
+ _gf_true);
out:
return ret;
}
@@ -7462,5 +7573,22 @@ struct volume_options options[] = {
"\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n"
},
#endif
+        /* Free-space threshold, given as a percentage or an absolute size;
+         * below it the write paths fail with ENOSPC. */
+        { .key  = {"min-free-disk"},
+          .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+          .default_value = "2%",
+          .description = "Minimum percentage/size of disk space, after which we "
+                         "start failing writes with ENOSPC."
+        },
+ {
+ .key = {"freespace-check-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "5",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Interval in seconds between freespace measurements "
+ "used for the min-free-disk determination. "
+ "Set to 0 to disable."
+ },
+
{ .key = {NULL} }
};
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 87f91e57747..ef4bc66ecbc 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -174,7 +174,14 @@ struct posix_private {
XATTR_BOTH,
} xattr_user_namespace;
#endif
-
+ /* freespace_check_lock protects access to following three fields. */
+ pthread_mutex_t freespace_check_lock;
+ struct timespec freespace_check_last;
+ struct statvfs freespace_stats;
+ double min_free_disk;
+ /* mutex protection ends. */
+ uint32_t freespace_check_interval;
+ gf_boolean_t freespace_check_passed;
};
typedef struct {
@@ -263,6 +270,9 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode,
void
posix_gfid_unset (xlator_t *this, dict_t *xdata);
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv);
+
int
posix_pacl_set (const char *path, const char *key, const char *acl_s);