-rw-r--r--  Makefile.am | 1
-rw-r--r--  api/src/glfs-mgmt.c | 3
-rwxr-xr-x  build.sh | 24
-rw-r--r--  build_env | 55
-rw-r--r--  cli/src/cli.c | 10
-rw-r--r--  configure.ac | 100
-rwxr-xr-x  fb-smoke.sh | 190
-rw-r--r--  glusterfs.spec.in | 21
-rw-r--r--  glusterfsd/src/glusterfsd-mgmt.c | 77
-rw-r--r--  glusterfsd/src/glusterfsd.c | 10
-rw-r--r--  glusterfsd/src/glusterfsd.h | 2
-rw-r--r--  libglusterfs/src/client_t.c | 2
-rw-r--r--  libglusterfs/src/common-utils.c | 28
-rw-r--r--  libglusterfs/src/common-utils.h | 2
-rw-r--r--  libglusterfs/src/compat.h | 8
-rw-r--r--  libglusterfs/src/dict.c | 95
-rw-r--r--  libglusterfs/src/dict.h | 11
-rw-r--r--  libglusterfs/src/glusterfs.h | 47
-rw-r--r--  libglusterfs/src/iobuf.c | 4
-rw-r--r--  libglusterfs/src/latency.c | 19
-rw-r--r--  libglusterfs/src/mem-pool.c | 12
-rw-r--r--  libglusterfs/src/mem-types.h | 1
-rw-r--r--  libglusterfs/src/timespec.c | 12
-rw-r--r--  libglusterfs/src/timespec.h | 3
-rw-r--r--  libglusterfs/src/xlator.c | 16
-rw-r--r--  libglusterfs/src/xlator.h | 1
-rwxr-xr-x  rfc.sh | 2
-rw-r--r--  rpc/rpc-lib/src/rpc-clnt-ping.c | 87
-rw-r--r--  rpc/rpc-lib/src/rpc-clnt-ping.h | 2
-rw-r--r--  rpc/rpc-lib/src/rpc-clnt.c | 29
-rw-r--r--  rpc/rpc-lib/src/rpc-clnt.h | 1
-rw-r--r--  rpc/rpc-lib/src/rpc-transport.c | 44
-rw-r--r--  rpc/rpc-lib/src/rpc-transport.h | 3
-rw-r--r--  rpc/rpc-lib/src/rpcsvc.c | 99
-rw-r--r--  rpc/rpc-lib/src/rpcsvc.h | 7
-rw-r--r--  rpc/rpc-lib/src/xdr-common.h | 7
-rw-r--r--  rpc/rpc-transport/rdma/src/name.c | 5
-rw-r--r--  rpc/rpc-transport/socket/src/name.c | 18
-rw-r--r--  rpc/rpc-transport/socket/src/socket.c | 22
-rw-r--r--  rpc/xdr/src/glusterfs-fops.x | 1
-rwxr-xr-x  run-tests.sh | 6
-rw-r--r--  site.h.in | 27
-rw-r--r--  test_env | 165
-rwxr-xr-x  tests/basic/accept-v6v4.t | 148
-rw-r--r--  tests/basic/afr/gfid-unsplit-shd.t | 98
-rw-r--r--  tests/basic/afr/gfid-unsplit-type-mismatch.t | 86
-rw-r--r--  tests/basic/afr/gfid-unsplit.t | 120
-rw-r--r--  tests/basic/afr/metadata-self-heal.t | 1
-rw-r--r--  tests/basic/afr/self-heal.t | 15
-rw-r--r--  tests/basic/afr/shd-autofix-nogfid.t | 68
-rw-r--r--  tests/basic/afr/shd-force-inspect.t | 61
-rw-r--r--  tests/basic/afr/shd-pgfid-heal.t | 81
-rwxr-xr-x  tests/basic/bd.t | 1
-rw-r--r--  tests/basic/cache.t | 69
-rwxr-xr-x  tests/basic/dht-min-free-space.t | 69
-rw-r--r--  tests/basic/ec/ec-common | 2
-rw-r--r--  tests/basic/ec/self-heal.t | 2
-rw-r--r--  tests/basic/exports_parsing.t | 15
-rw-r--r--  tests/basic/fop-sampling.t | 78
-rwxr-xr-x  tests/basic/fops-sanity-gfproxy.t | 32
-rw-r--r--  tests/basic/gfid-access.t | 1
-rw-r--r--  tests/basic/gfproxy.t | 74
-rw-r--r--  tests/basic/glusterd/volfile_server_switch.t | 3
-rw-r--r--  tests/basic/halo-failover-disabled.t | 77
-rw-r--r--  tests/basic/halo-failover-enabled.t | 85
-rw-r--r--  tests/basic/halo-hybrid.t | 70
-rw-r--r--  tests/basic/halo.t | 51
-rwxr-xr-x  tests/basic/mount-nfs-auth.t | 107
-rw-r--r--  tests/basic/pgfid-feat.t | 1
-rwxr-xr-x  tests/basic/quota-anon-fd-nfs.t | 1
-rwxr-xr-x  tests/basic/quota.t | 1
-rw-r--r-- [-rwxr-xr-x]  tests/basic/rpc-coverage.t | 1
-rw-r--r--  tests/basic/stats-dump.t | 5
-rw-r--r--  tests/basic/uss.t | 2
-rw-r--r--  tests/basic/write-behind.t | 53
-rw-r--r--  tests/bugs/distribute/bug-1099890.t | 2
-rwxr-xr-x  tests/bugs/distribute/bug-1161311.t | 10
-rwxr-xr-x  tests/bugs/fb4482137.t | 65
-rw-r--r--  tests/bugs/fb8149516.t | 40
-rw-r--r--  tests/bugs/fuse/bug-858488-min-free-disk.t | 1
-rw-r--r--  tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t | 22
-rwxr-xr-x  tests/bugs/glusterd/bug-859927.t | 8
-rwxr-xr-x  tests/bugs/nfs/bug-1166862.t | 4
-rwxr-xr-x  tests/bugs/nfs/bug-904065.t | 4
-rw-r--r--  tests/bugs/quota/bug-1292020.t | 7
-rwxr-xr-x  tests/bugs/replicate/bug-859581.t | 2
-rw-r--r--  tests/cluster.rc | 9
-rw-r--r--  tests/configfiles/exports-v6 | 1
-rw-r--r--  tests/env.rc.in | 3
-rwxr-xr-x  tests/features/brick-min-free-space.t | 121
-rw-r--r--  tests/features/lock_revocation.t | 52
-rw-r--r--  tests/halo.rc | 52
-rw-r--r--  tests/include.rc | 19
-rw-r--r--  tests/nfs.rc | 2
-rw-r--r--  tests/volume.rc | 7
-rw-r--r--  xlators/cluster/Makefile.am | 2
-rw-r--r--  xlators/cluster/afr/src/afr-common.c | 762
-rw-r--r--  xlators/cluster/afr/src/afr-inode-read.c | 9
-rw-r--r--  xlators/cluster/afr/src/afr-mem-types.h | 3
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-common.c | 228
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-data.c | 6
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-entry.c | 144
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-metadata.c | 11
-rw-r--r--  xlators/cluster/afr/src/afr-self-heal-name.c | 383
-rw-r--r--  xlators/cluster/afr/src/afr-self-heald.h | 1
-rw-r--r--  xlators/cluster/afr/src/afr.c | 185
-rw-r--r--  xlators/cluster/afr/src/afr.h | 37
-rw-r--r--  xlators/cluster/aha/Makefile.am | 3
-rw-r--r--  xlators/cluster/aha/src/Makefile.am | 18
-rw-r--r--  xlators/cluster/aha/src/aha-fops.c | 952
-rw-r--r--  xlators/cluster/aha/src/aha-fops.h | 360
-rw-r--r--  xlators/cluster/aha/src/aha-helpers.c | 46
-rw-r--r--  xlators/cluster/aha/src/aha-helpers.h | 23
-rw-r--r--  xlators/cluster/aha/src/aha-mem-types.h | 22
-rw-r--r--  xlators/cluster/aha/src/aha-retry.c | 524
-rw-r--r--  xlators/cluster/aha/src/aha-retry.h | 12
-rw-r--r--  xlators/cluster/aha/src/aha.c | 345
-rw-r--r--  xlators/cluster/aha/src/aha.h | 46
-rw-r--r--  xlators/cluster/dht/src/dht-common.c | 51
-rw-r--r--  xlators/cluster/dht/src/dht-common.h | 8
-rw-r--r--  xlators/cluster/dht/src/dht-diskusage.c | 53
-rw-r--r--  xlators/cluster/dht/src/dht-inode-read.c | 10
-rw-r--r--  xlators/cluster/dht/src/dht-inode-write.c | 12
-rw-r--r--  xlators/cluster/dht/src/dht-rebalance.c | 24
-rw-r--r--  xlators/cluster/dht/src/dht-shared.c | 32
-rw-r--r--  xlators/cluster/dht/src/nufa.c | 10
-rw-r--r--  xlators/cluster/dht/src/switch.c | 10
-rw-r--r--  xlators/debug/io-stats/src/io-stats.c | 705
-rw-r--r--  xlators/features/changelog/lib/src/gf-changelog-rpc.c | 1
-rw-r--r--  xlators/features/changelog/src/changelog-ev-handle.c | 2
-rw-r--r--  xlators/features/locks/src/clear.c | 4
-rw-r--r--  xlators/features/locks/src/common.c | 13
-rw-r--r--  xlators/features/locks/src/common.h | 3
-rw-r--r--  xlators/features/locks/src/entrylk.c | 115
-rw-r--r--  xlators/features/locks/src/inodelk.c | 119
-rw-r--r--  xlators/features/locks/src/locks.h | 4
-rw-r--r--  xlators/features/locks/src/posix.c | 56
-rw-r--r--  xlators/features/marker/src/marker.c | 14
-rw-r--r--  xlators/features/quota/src/quota.c | 2
-rw-r--r--  xlators/features/snapview-server/src/snapview-server-mgmt.c | 2
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-handler.c | 21
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-handshake.c | 44
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-utils.c | 47
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-utils.h | 8
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volgen.c | 337
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volgen.h | 3
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 10
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c | 135
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd.h | 5
-rwxr-xr-x  xlators/mount/fuse/utils/mount.glusterfs.in | 31
-rw-r--r--  xlators/nfs/server/src/auth-cache.c | 542
-rw-r--r--  xlators/nfs/server/src/auth-cache.h | 31
-rw-r--r--  xlators/nfs/server/src/exports.h | 11
-rw-r--r--  xlators/nfs/server/src/mount3-auth.c | 13
-rw-r--r--  xlators/nfs/server/src/mount3.c | 120
-rw-r--r--  xlators/nfs/server/src/mount3udp_svc.c | 10
-rw-r--r--  xlators/nfs/server/src/nfs-common.c | 6
-rw-r--r--  xlators/nfs/server/src/nfs.c | 141
-rw-r--r--  xlators/nfs/server/src/nfs.h | 2
-rw-r--r--  xlators/nfs/server/src/nfs3-helpers.c | 18
-rw-r--r--  xlators/nfs/server/src/nfs3.c | 99
-rw-r--r--  xlators/nfs/server/src/nfs3.h | 10
-rw-r--r--  xlators/performance/io-cache/src/io-cache.c | 102
-rw-r--r--  xlators/performance/io-cache/src/io-cache.h | 41
-rw-r--r--  xlators/performance/io-threads/src/io-threads.c | 75
-rw-r--r--  xlators/performance/io-threads/src/io-threads.h | 13
-rw-r--r--  xlators/performance/md-cache/src/md-cache.c | 51
-rw-r--r--  xlators/performance/write-behind/src/write-behind.c | 14
-rw-r--r--  xlators/protocol/client/src/client-handshake.c | 3
-rw-r--r--  xlators/protocol/client/src/client.c | 130
-rw-r--r--  xlators/protocol/server/src/server-resolve.c | 9
-rw-r--r--  xlators/protocol/server/src/server-rpc-fops.c | 8
-rw-r--r--  xlators/storage/posix/src/posix-aio.c | 5
-rw-r--r--  xlators/storage/posix/src/posix-handle.c | 6
-rw-r--r--  xlators/storage/posix/src/posix-helpers.c | 37
-rw-r--r--  xlators/storage/posix/src/posix.c | 157
-rw-r--r--  xlators/storage/posix/src/posix.h | 12
177 files changed, 9731 insertions, 1136 deletions
diff --git a/Makefile.am b/Makefile.am
index d36f53055ea..c6f5618b541 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,3 +1,4 @@
+SOURCES = site.h
EXTRA_DIST = autogen.sh \
COPYING-GPLV2 COPYING-LGPLV3 \
INSTALL README.md AUTHORS THANKS NEWS \
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c
index 8c9872cfa53..5d08114c8c5 100644
--- a/api/src/glfs-mgmt.c
+++ b/api/src/glfs-mgmt.c
@@ -911,7 +911,8 @@ glfs_mgmt_init (struct glfs *fs)
if (!strcmp (cmd_args->volfile_server_transport, "unix")) {
ret = rpc_transport_unix_options_build (&options, host, 0);
} else {
- ret = rpc_transport_inet_options_build (&options, host, port);
+ ret = rpc_transport_inet_options_build (&options, host, port,
+ NULL);
}
if (ret)
diff --git a/build.sh b/build.sh
new file mode 100755
index 00000000000..2eb5ae75424
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+PACKAGES="automake libtool gperftools-devel gperftools-debuginfo gperftools-libs \
+ glib2-devel jemalloc jemalloc-devel fb-gcc flex bison openssl-devel libxml2-devel\
+ libacl-devel userspace-rcu-devel lvm2 python-devel"
+
+if [ $(/usr/lib/rpm/redhat/dist.sh --distnum) -eq "7" ]; then
+ PACKAGES="$PACKAGES libtirpc libtirpc-devel-0.2.4 devtoolset-4-binutils devtoolset-4-gcc devtoolset-4-runtime"
+elif [ $(/usr/lib/rpm/redhat/dist.sh --distnum) -eq "6" ]; then
+ PACKAGES="$PACKAGES libfbtirpc libfbtirpc-devel libgssglue libgssglue-devel devtoolset-2-binutils devtoolset-2-gcc devtoolset-2-runtime"
+else
+ echo "Centos $(/usr/lib/rpm/redhat/dist.sh --distnum) is not currently supported"
+ exit 1
+fi
+
+# Skip this for Jenkins automated builds (they already have these packages)
+# as the sudo will cause the build to fail
+[ $USER == "svcscm" ] || sudo yum install $PACKAGES -y
+
+source ./build_env
+
+./autogen.sh || exit 1
+./configure $GF_CONF_OPTS
+make -j || exit 1
diff --git a/build_env b/build_env
new file mode 100644
index 00000000000..74f7c0256e1
--- /dev/null
+++ b/build_env
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+#
+# Note: The GF_CONF_OPTS (configure) options will only be used for dev or
+# test builds. For RPM builds the configure options are defined in the
+# glusterfs.spec.in file. ASAN is not enabled by default for RPMs as it degrades
+# performance. In specific instances it can be enabled simply by appending
+# the --with-asan option in the %build step of the spec file.
+#
+
+GF_CONF_OPTS="--localstatedir=/var --sysconfdir /var/lib --prefix /usr --libdir /usr/lib64 \
+ --enable-fusermount --enable-api --with-jemalloc \
+ --with-ipv6-default --with-fbextras --disable-tiering"
+
+if [ -x /usr/lib/rpm/redhat/dist.sh ]; then
+ REDHAT_MAJOR=$(/usr/lib/rpm/redhat/dist.sh --distnum)
+else
+ REDHAT_MAJOR=0
+fi
+
+# Enable systemd support on CentOS >= 7
+if [ $REDHAT_MAJOR -ge 7 ]; then
+ GF_CONF_OPTS="$GF_CONF_OPTS --with-systemd"
+fi
+export GF_CONF_OPTS
+
+ASAN_ENABLED=0
+# Check if ASAN is enabled
+if [ "$ASAN_ENABLED" -eq "1" ]; then
+ GF_CONF_OPTS="$GF_CONF_OPTS --with-asan"
+fi
+
+if [ $REDHAT_MAJOR -eq "7" ]; then
+ GCC_BIN="/opt/rh/devtoolset-4/root/usr/bin/gcc"
+ GCC_LIB="/opt/rh/devtoolset-4/root/lib64"
+ DESTDIR='/' # pycompile is finicky in centos7 if --destdir is passed nothing.
+elif [ $REDHAT_MAJOR -eq "6" ]; then
+ ENGSHARE_GCC_PATH="/mnt/vol/engshare/third-party2/gcc"
+ GCC_BIN="$ENGSHARE_GCC_PATH/4.9.x/centos6-native/108cf83/bin/gcc"
+ GCC_LIB="$ENGSHARE_GCC_PATH/4.9.x/centos6-native/108cf83/lib64"
+else
+ echo "Centos $REDHAT_MAJOR is not currently supported"
+ exit 1
+fi
+
+export LIB_DIR="$GCC_LIB"
+export CC="$GCC_BIN"
+
+#export CC="/mnt/vol/engshare/third-party2/gcc/4.9.x/centos6-native/108cf83/bin/gcc"
+
+# If you think this should all be done in configure.ac you'd be 100%
+# correct; aside from the fact that it simply doesn't work when done there :).
+# You'll find the debug symbols are not present in resultant binaries nor is
+# the code un-optimized.
+export CFLAGS="-O0 -ggdb -fPIC -Wall -Werror -L${LIB_DIR}"
diff --git a/cli/src/cli.c b/cli/src/cli.c
index 2ecaae415d6..58fd9104f81 100644
--- a/cli/src/cli.c
+++ b/cli/src/cli.c
@@ -586,6 +586,11 @@ cli_rpc_init (struct cli_state *state)
int ret = -1;
int port = CLI_GLUSTERD_PORT;
xlator_t *this = NULL;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+#else
+ char *addr_family = "inet";
+#endif
this = THIS;
cli_rpc_prog = &cli_prog;
@@ -621,7 +626,8 @@ cli_rpc_init (struct cli_state *state)
goto out;
ret = dict_set_str (options, "transport.address-family",
- "inet");
+ addr_family);
+
if (ret)
goto out;
}
@@ -706,7 +712,7 @@ main (int argc, char *argv[])
if (ret)
goto out;
- cli_default_conn_timeout = 120;
+ cli_default_conn_timeout = 600;
cli_ten_minutes_timeout = 600;
ret = cli_state_init (&state);
diff --git a/configure.ac b/configure.ac
index 1ab3c996d3c..97eb137c752 100644
--- a/configure.ac
+++ b/configure.ac
@@ -33,7 +33,7 @@ if libtool --help 2>&1 | grep -q quiet; then
AM_LIBTOOLFLAGS="--quiet";
fi
-AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_HEADERS([config.h site.h])
AC_CONFIG_FILES([Makefile
libglusterfs/Makefile
@@ -72,6 +72,8 @@ AC_CONFIG_FILES([Makefile
xlators/cluster/Makefile
xlators/cluster/afr/Makefile
xlators/cluster/afr/src/Makefile
+ xlators/cluster/aha/Makefile
+ xlators/cluster/aha/src/Makefile
xlators/cluster/stripe/Makefile
xlators/cluster/stripe/src/Makefile
xlators/cluster/dht/Makefile
@@ -275,7 +277,19 @@ if test "x$enable_debug" = "xyes"; then
CFLAGS="${CFLAGS} -g -O0 -DDEBUG"
else
BUILD_DEBUG=no
- CFLAGS="${CFLAGS} -g -O2"
+ CFLAGS="${CFLAGS} -g"
+fi
+
+AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.]))
+if test "x$with_fbextras" = "xyes"; then
+ BUILD_FBEXTRAS=yes
+else
+ BUILD_FBEXTRAS=no
+fi
+
+AC_ARG_ENABLE([privport_prefer], AC_HELP_STRING([--disable-privport_prefer], [Disable preferred usage of privileged ports.]))
+if test "x$enable_privport_prefer" = "xno"; then
+ CFLAGS="${CFLAGS} -DNO_PRIVPORT"
fi
case $host_os in
@@ -349,6 +363,10 @@ AC_ARG_WITH([ocf],
)
AC_SUBST(OCF_SUBDIR)
+AC_ARG_WITH(asan,--with-asan,,with_asan="no")
+AC_ARG_WITH(tsan,--with-tsan,,with_tsan="no")
+AC_ARG_WITH(jemalloc,--with-jemalloc,,with_jemalloc="no")
+
# LEX needs a check
AC_PROG_LEX
if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then
@@ -908,6 +926,71 @@ AC_SUBST(GF_DISTRIBUTION)
GF_HOST_OS=""
GF_LDFLAGS="-rdynamic"
+BUILD_ASAN=no
+if test "x$with_asan" = "xyes"; then
+ echo -n "checking for address sanitizer (ASAN) support... "
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM()])
+ $CC conftest.c $CFLAGS -fsanitize=address -o conftest
+ ret=$?
+ rm -f conftest.o conftest
+ if test $ret -eq 0 ; then
+ echo "yes"
+ BUILD_ASAN=yes
+ GF_CFLAGS="$GF_CFLAGS -DASAN -fsanitize=address -O0 -ggdb"
+ GF_LDFLAGS="-gdb -static-libasan $GF_LDFLAGS"
+ else
+ echo "no"
+ echo "ERROR: ASAN not supported by compiler ($CC)"
+ exit 1
+ fi
+fi
+
+BUILD_TSAN=no
+if test "x$with_tsan" = "xyes"; then
+ echo -n "checking for thread sanitizer (TSAN) support... "
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM()])
+ $CC conftest.c $CFLAGS -fsanitize=thread -o conftest > /dev/null 2> /dev/null
+ ret=$?
+ rm -f conftest.o conftest
+ if test $ret -eq 0 ; then
+ echo "yes"
+ BUILD_TSAN=yes
+ GF_CFLAGS="$GF_CFLAGS -fsanitize=thread -O0 -ggdb -fPIC -pie"
+ GF_LDFLAGS="-gdb -static-libtsan $GF_LDFLAGS"
+ else
+ echo "no"
+ echo "ERROR: TSAN not supported by compiler ($CC)"
+ exit 1
+ fi
+fi
+
+BUILD_JEMALLOC=no
+if test "x$with_jemalloc" = "xyes"; then
+ echo -n "checking for jemalloc support... "
+ AC_LANG_CONFTEST([AC_LANG_PROGRAM()])
+ $CC conftest.c $CFLAGS -ljemalloc -o conftest > /dev/null 2> /dev/null
+ ret=$?
+ rm -f conftest.o conftest
+ if test $ret -eq 0 ; then
+ echo "yes"
+ BUILD_JEMALLOC=yes
+ GF_LDFLAGS="-ljemalloc $GF_LDFLAGS"
+ else
+ echo "no"
+ echo "ERROR: jemalloc linking error"
+ exit 1
+ fi
+fi
+
+TESTER_CFLAGS=""
+dnl include tirpc for FB builds
+if test "x$BUILD_FBEXTRAS" = "xyes"; then
+ TIRPC_CFLAGS="-I/usr/include/tirpc"
+ GF_LDFLAGS="-ltirpc $GF_LDFLAGS"
+ GF_CFLAGS="$TIRPC_CFLAGS $GF_CFLAGS -DIPV6_DEFAULT -DGF_FBEXTRAS"
+ TESTER_CFLAGS="$TESTER_CFLAGS -ltirpc"
+fi
+
dnl check for gcc -Werror=format-security
saved_CFLAGS=$CFLAGS
CFLAGS="-Wformat -Werror=format-security"
@@ -1099,6 +1182,12 @@ AC_ARG_ENABLE([debug],
AC_HELP_STRING([--enable-debug],
[Enable debug build options.]))
+AC_ARG_ENABLE([mempool],
+ AC_HELP_STRING([--disable-mempool],
+ [Disable the Gluster memory pooler.]))
+if test "x$enable_mempool" = "xno"; then
+ CFLAGS="${CFLAGS} -DDISABLE_MEMPOOL"
+fi
# syslog section
AC_ARG_ENABLE([syslog],
@@ -1287,19 +1376,21 @@ CONTRIBDIR='$(top_srcdir)/contrib'
AC_SUBST(CONTRIBDIR)
GF_CPPDEFINES='-D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS)'
-GF_CPPINCLUDES='-include $(top_builddir)/config.h -I$(top_srcdir)/libglusterfs/src -I$(top_builddir)/libglusterfs/src'
+GF_CPPINCLUDES='-include $(top_builddir)/config.h -include $(top_builddir)/site.h -I$(top_srcdir)/libglusterfs/src -I$(top_builddir)/libglusterfs/src'
GF_CPPFLAGS="$GF_CPPFLAGS $GF_CPPDEFINES $GF_CPPINCLUDES"
AC_SUBST([GF_CPPFLAGS])
AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS")
AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS")
AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS")
+AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes")
AC_SUBST(GLUSTERD_WORKDIR)
AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! -d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd )
AC_SUBST(GLUSTERD_VOLFILE)
AC_SUBST(GLUSTERFS_LIBEXECDIR)
AC_SUBST(GLUSTERFSD_MISCDIR)
+AC_SUBST(TESTER_CFLAGS)
dnl pkg-config versioning
dnl
@@ -1361,4 +1452,7 @@ echo "POSIX ACLs : $BUILD_POSIX_ACLS"
echo "Data Classification : $BUILD_GFDB"
echo "firewalld-config : $BUILD_FIREWALLD"
echo "Experimental xlators : $BUILD_EXPERIMENTAL"
+echo "ASAN enabled : $BUILD_ASAN"
+echo "TSAN enabled : $BUILD_TSAN"
+echo "jemalloc enabled : $BUILD_JEMALLOC"
echo
diff --git a/fb-smoke.sh b/fb-smoke.sh
new file mode 100755
index 00000000000..a68b9414cd2
--- /dev/null
+++ b/fb-smoke.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+#
+# Tests
+#
+source ./test_env
+
+#
+# Helpers
+#
+function elapsed_since {
+ start=$1
+ echo $(( SECONDS - start ))
+}
+
+function flaky {
+ local f
+ for f in ${KNOWN_FLAKY_TESTS}; do
+ if [ "$f" == "$1" ]; then
+ return 0
+ fi
+ done
+ return 1
+}
+
+function outfile {
+ printf "/tmp/%s.out" "$(echo "$f" | tr / -)"
+}
+
+function exit_test {
+ if [ "$STOP_ON_FAIL" -eq "1" ]; then
+ print_result
+ exit "$1"
+ fi
+}
+
+function test {
+ f=$1
+ printf "%s" "$f"
+ local start
+ start=$SECONDS
+ local out
+ out=$(outfile "$f")
+
+ local rc=1
+ for i in $(seq 1 "$ATTEMPT"); do
+ DEBUG=1 timeout --foreground "$TEST_TIMEOUT" prove -v "$f" &> "$out.$i"
+ rc=$?
+
+ if [ "$rc" -eq "0" ]; then
+ SUCCESS=$SUCCESS+1
+ printf " PASS (%s s)\n" "$(elapsed_since $start)"
+ rm -f "$out.$i"
+ return 0
+ else
+ printf " %s" "($i/$ATTEMPT)"
+ fi
+ done
+
+ FAILED_TESTS+=($f)
+ FAIL=$FAIL+1
+ # rc holds the status of the last attempt; timeout(1) exits 124/137 on a timeout
+ if [[ $rc -eq 124 || $rc -eq 137 ]]; then
+ printf " TIMEOUT (%s s)\n" "$(elapsed_since $start)"
+ else
+ printf " FAIL (%s s)\n" "$(elapsed_since $start)"
+ fi
+ exit_test 1
+}
+
+function flakytest {
+ f=$1
+
+ if [ "$SKIP_FLAKY" -eq "1" ]; then
+ SKIP=$SKIP+1
+ else
+ printf "<flaky> "
+ test "$f"
+ fi
+}
+
+function print_result {
+ echo
+ echo "== RESULTS =="
+ echo "TESTS : $TOTAL"
+ echo "SUCCESS : $SUCCESS"
+ echo "FAIL : $FAIL"
+ echo "SKIP : $SKIP"
+
+ if [ "$FAIL" -gt "0" ]; then
+ echo
+ echo "== FAILED TESTS =="
+ echo "${FAILED_TESTS[@]}"
+ echo
+ echo "== LOGS =="
+ ls /tmp/*.out.*
+ echo
+ echo "== END =="
+ fi
+}
+
+function run_remote {
+ if [ ! -d "$FBCODE" ]; then
+ echo "fbcode does not exist. Please check out fbcode"
+ return 1
+ fi
+
+ local flags=''
+ if [ "$VERBOSE" -eq "1" ]; then
+ flags="$flags -v"
+ fi
+
+ if [ "$VALGRIND" -eq "1" ]; then
+ flags="$flags --valgrind"
+ fi
+
+ if [ "$ASAN" -eq "1" ]; then
+ flags="$flags --asan"
+ fi
+
+ "$FBCODE/storage/gluster/gluster-build/fb-gluster-test.py" $flags --tester \
+ --n "$N" --hosts "$REMOTE_HOSTS" --tests "$REMOTE_TESTS"\
+ --flaky_tests "$REMOTE_FLAKY_TESTS"
+}
+
+#
+# Main
+#
+declare -i TOTAL=0
+declare -i SUCCESS=0
+declare -i FAIL=0
+declare -i SKIP=0
+declare -a FAILED_TESTS
+
+TEST_TIMEOUT=${TEST_TIMEOUT:=300}
+SKIP_FLAKY=${SKIP_FLAKY:=1}
+STOP_ON_FAIL=${STOP_ON_FAIL:=0}
+FBCODE=${FBCODE:="$HOME/fbsource/fbcode"}
+N=${N:=0}
+REMOTE_HOSTS=${REMOTE_HOSTS:="$(smcc ls-hosts -s gluster.build.ash | xargs)"}
+REMOTE=${REMOTE:=0}
+REMOTE_TESTS=${REMOTE_TESTS:=$DESIRED_TESTS}
+REMOTE_FLAKY_TESTS=${REMOTE_FLAKY_TESTS:=$KNOWN_FLAKY_TESTS}
+VERBOSE=${VERBOSE:=0}
+VALGRIND=${VALGRIND:=0}
+ASAN=${ASAN:=0}
+
+if [ "$REMOTE" -eq "1" ]; then
+ run_remote
+ exit $?
+fi
+
+if [ "$SKIP_FLAKY" -eq "0" ]; then
+ ATTEMPT=${ATTEMPT:=3}
+else
+ ATTEMPT=${ATTEMPT:=1}
+fi
+
+echo "== SETTINGS =="
+echo "TEST_TIMEOUT = $TEST_TIMEOUT s"
+echo "SKIP_FLAKY = $SKIP_FLAKY"
+echo "STOP_ON_FAIL = $STOP_ON_FAIL"
+echo "ATTEMPT = $ATTEMPT"
+echo "REMOTE = $REMOTE"
+echo "FBCODE = $FBCODE"
+echo
+
+# try cleaning up the environment
+rm -f /tmp/*.out.* || true
+
+# sanity check
+if ! cmp -s ./glusterfsd/src/.libs/glusterfsd $(which glusterfsd)
+then
+ echo "Installed gluster does not match local build, perhaps you ought to run 'make install'?"
+ exit 1
+fi
+
+echo "== TESTS =="
+for f in ${DESIRED_TESTS}
+do
+ TOTAL=$TOTAL+1
+ if flaky "$f"; then
+ flakytest "$f"
+ else
+ test "$f"
+ fi
+done
+
+print_result
+exit $FAIL
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
index 66c9a46a2be..6c30a955977 100644
--- a/glusterfs.spec.in
+++ b/glusterfs.spec.in
@@ -13,6 +13,10 @@
# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with debug
%{?_with_debug:%global _with_debug --enable-debug}
+# if you wish to compile an rpm with Facebook specific extras...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras
+%{?_with_fbextras:%global _with_fbextras --with-fbextras}
+
# if you wish to compile an rpm with cmocka unit testing...
# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka
%{?_with_cmocka:%global _with_cmocka --enable-cmocka}
@@ -196,6 +200,10 @@ BuildRequires: libxml2-devel openssl-devel
BuildRequires: libaio-devel libacl-devel
BuildRequires: python-devel
BuildRequires: python-ctypes
+%if ( 0%{?_with_fbextras:1} )
+BuildRequires: fb-libtirpc fb-libtirpc-devel
+BuildRequires: jemalloc jemalloc-devel
+%endif
BuildRequires: userspace-rcu-devel >= 0.7
%if ( 0%{?rhel} && 0%{?rhel} <= 6 )
BuildRequires: automake
@@ -513,6 +521,10 @@ Requires: %{name}-cli%{?_isa} = %{version}-%{release}
Requires: %{name}-libs%{?_isa} = %{version}-%{release}
# some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse
Requires: %{name}-fuse%{?_isa} = %{version}-%{release}
+%if ( 0%{?_with_fbextras:1} )
+Requires: fb-libtirpc >= 0.2.5-1
+Requires: jemalloc >= 3.6.0-1
+%endif
# self-heal daemon, rebalance, nfs-server etc. are actually clients
Requires: %{name}-api%{?_isa} = %{version}-%{release}
Requires: %{name}-client-xlators%{?_isa} = %{version}-%{release}
@@ -600,7 +612,8 @@ export CFLAGS
%{?_without_ocf} \
%{?_without_rdma} \
%{?_without_syslog} \
- %{?_without_tiering}
+ %{?_without_tiering} \
+ %{?_with_fbextras}
# fix hardening and remove rpath in shlibs
%if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 )
@@ -807,6 +820,12 @@ fi
%firewalld_reload
%endif
+%if ( 0%{?_with_fbextras:1} )
+if ! [ -f %{_sharedstatedir}/glusterd/glusterd.info ]; then
+ echo "UUID=$(/usr/bin/uuidgen)" >> %{_sharedstatedir}/glusterd/glusterd.info
+fi
+%endif
+
pidof -c -o %PPID -x glusterd &> /dev/null
if [ $? -eq 0 ]; then
kill -9 `pgrep -f gsyncd.py` &> /dev/null
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index c47fa3883c9..a7c96d1e7a0 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -1903,9 +1903,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
switch (event) {
case RPC_CLNT_DISCONNECT:
- GF_LOG_OCCASIONALLY (log_ctr1, "glusterfsd-mgmt", GF_LOG_ERROR,
- "failed to connect with remote-host: %s (%s)",
- ctx->cmd_args.volfile_server, strerror (errno));
+ ctx->cmd_args.connect_attempts++;
+
+ gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
+ "Connect attempt with remote-host: %s (%s) (%u/%d)",
+ ctx->cmd_args.volfile_server,
+ strerror (errno),
+ ctx->cmd_args.connect_attempts,
+ ctx->cmd_args.max_connect_attempts);
if (!rpc->disabled) {
/*
* Check if dnscache is exhausted for current server
@@ -1916,8 +1921,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
break;
}
}
+
+ /* If we run out of servers, AND we attempted to connect
+ * max connect times, then we should return ENOTCONN
+ */
server = ctx->cmd_args.curr_server;
- if (server->list.next == &ctx->cmd_args.volfile_servers) {
+ if ((ctx->cmd_args.connect_attempts >=
+ ctx->cmd_args.max_connect_attempts) &&
+ server->list.next == &ctx->cmd_args.volfile_servers) {
if (!ctx->active)
need_term = 1;
emval = ENOTCONN;
@@ -1926,24 +1937,33 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
"Exhausted all volfile servers");
break;
}
- server = list_entry (server->list.next, typeof(*server), list);
- ctx->cmd_args.curr_server = server;
- ctx->cmd_args.volfile_server = server->volfile_server;
-
- ret = dict_set_str (rpc_trans->options, "remote-host",
- server->volfile_server);
- if (ret != 0) {
- gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
- "failed to set remote-host: %s",
+
+ /* If we exceed the # of connect attempts, we should
+ * move onto the next server
+ */
+ if (ctx->cmd_args.connect_attempts >=
+ ctx->cmd_args.max_connect_attempts || !server) {
+ server = list_entry (server->list.next,
+ typeof(*server), list);
+ ctx->cmd_args.curr_server = server;
+ ctx->cmd_args.volfile_server = server->volfile_server;
+
+ ret = dict_set_str (rpc_trans->options, "remote-host",
+ server->volfile_server);
+ if (ret != 0) {
+ gf_log ("glusterfsd-mgmt", GF_LOG_ERROR,
+ "failed to set remote-host: %s",
+ server->volfile_server);
+ if (!ctx->active)
+ need_term = 1;
+ emval = ENOTCONN;
+ break;
+ }
+ ctx->cmd_args.connect_attempts = 0;
+ gf_log ("glusterfsd-mgmt", GF_LOG_INFO,
+ "connecting to next volfile server %s",
server->volfile_server);
- if (!ctx->active)
- need_term = 1;
- emval = ENOTCONN;
- break;
}
- gf_log ("glusterfsd-mgmt", GF_LOG_INFO,
- "connecting to next volfile server %s",
- server->volfile_server);
break;
case RPC_CLNT_CONNECT:
rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn);
@@ -1960,7 +1980,7 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
}
}
-
+ ctx->cmd_args.connect_attempts = 0;
if (is_mgmt_rpc_reconnect)
glusterfs_mgmt_pmap_signin (ctx);
@@ -2120,6 +2140,7 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx)
int ret = -1;
int port = GF_DEFAULT_BASE_PORT;
char *host = NULL;
+ char *addr_family = NULL;
cmd_args = &ctx->cmd_args;
GF_VALIDATE_OR_GOTO (THIS->name, cmd_args->volfile_server, out);
@@ -2136,7 +2157,19 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx)
!strcmp (cmd_args->volfile_server_transport, "unix")) {
ret = rpc_transport_unix_options_build (&options, host, 0);
} else {
- ret = rpc_transport_inet_options_build (&options, host, port);
+ xlator_cmdline_option_t *cmd_option = NULL;
+
+ list_for_each_entry (cmd_option,
+ &cmd_args->xlator_options, cmd_args) {
+ if (!strcmp(cmd_option->key,
+ "transport.address-family")) {
+ addr_family = cmd_option->value;
+ break;
+ }
+ }
+
+ ret = rpc_transport_inet_options_build (&options, host, port,
+ addr_family);
}
if (ret)
goto out;
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index 6c7a7c883fa..5022cfc22da 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -986,7 +986,7 @@ parse_opts (int key, char *arg, struct argp_state *state)
cmd_args->debug_mode = ENABLE_DEBUG_MODE;
break;
case ARGP_VOLFILE_MAX_FETCH_ATTEMPTS:
- cmd_args->max_connect_attempts = 1;
+ cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;
break;
case ARGP_DIRECT_IO_MODE_KEY:
@@ -1955,13 +1955,7 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)
}
}
- /*
- This option was made obsolete but parsing it for backward
- compatibility with third party applications
- */
- if (cmd_args->max_connect_attempts) {
- gf_msg ("glusterfs", GF_LOG_WARNING, 0, glusterfsd_msg_33);
- }
+ cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;
#ifdef GF_DARWIN_HOST_OS
if (cmd_args->mount_point)
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
index e442bede5db..b5c6b27b534 100644
--- a/glusterfsd/src/glusterfsd.h
+++ b/glusterfsd/src/glusterfsd.h
@@ -16,7 +16,7 @@
#define DEFAULT_GLUSTERD_VOLFILE CONFDIR "/glusterd.vol"
#define DEFAULT_CLIENT_VOLFILE CONFDIR "/glusterfs.vol"
#define DEFAULT_SERVER_VOLFILE CONFDIR "/glusterfsd.vol"
-
+#define DEFAULT_MAX_CONNECT_ATTEMPTS 200
#define DEFAULT_EVENT_POOL_SIZE 16384
#define ARGP_LOG_LEVEL_NONE_OPTION "NONE"
diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c
index 3e0e5936ae2..b3eb4e4df8c 100644
--- a/libglusterfs/src/client_t.c
+++ b/libglusterfs/src/client_t.c
@@ -366,6 +366,8 @@ client_destroy (client_t *client)
}
}
GF_FREE (client->auth.data);
+ GF_FREE (client->auth.username);
+ GF_FREE (client->auth.passwd);
GF_FREE (client->scratch_ctx.ctx);
GF_FREE (client->client_uid);
GF_FREE (client);
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index d7cd0ad015d..e533992556b 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -181,26 +181,16 @@ gf_rev_dns_lookup (const char *ip)
{
char *fqdn = NULL;
int ret = 0;
- struct sockaddr_in sa = {0};
- char host_addr[256] = {0, };
GF_VALIDATE_OR_GOTO ("resolver", ip, out);
- sa.sin_family = AF_INET;
- inet_pton (AF_INET, ip, &sa.sin_addr);
- ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr,
- sizeof (host_addr), NULL, 0, 0);
-
+ /* Get the FQDN */
+ ret = gf_get_hostname_from_ip ((char *)ip, &fqdn);
if (ret != 0) {
gf_msg ("resolver", GF_LOG_INFO, errno,
LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve "
"hostname for %s", ip);
- goto out;
}
-
- /* Get the FQDN */
- fqdn = gf_strdup (host_addr);
-
out:
return fqdn;
}
@@ -3107,11 +3097,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
char *client_ip_copy = NULL;
char *tmp = NULL;
char *ip = NULL;
+ size_t addr_sz = 0;
/* if ipv4, reverse lookup the hostname to
* allow FQDN based rpc authentication
*/
- if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) {
+ if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) &&
+ !valid_ipv4_address (client_ip, strlen (client_ip), 0)) {
/* most times, we get a.b.c.d:port form, so check that */
client_ip_copy = gf_strdup (client_ip);
if (!client_ip_copy)
@@ -3124,12 +3116,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) {
client_sockaddr = (struct sockaddr *)&client_sock_in;
+ addr_sz = sizeof (client_sock_in);
client_sock_in.sin_family = AF_INET;
ret = inet_pton (AF_INET, ip,
(void *)&client_sock_in.sin_addr.s_addr);
} else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) {
client_sockaddr = (struct sockaddr *) &client_sock_in6;
+ addr_sz = sizeof (client_sock_in6);
client_sock_in6.sin6_family = AF_INET6;
ret = inet_pton (AF_INET6, ip,
@@ -3143,8 +3137,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)
goto out;
}
+ /* You cannot just use sizeof (*client_sockaddr), as per the man page
+ * the (getnameinfo) size must be the size of the underlying sockaddr
+ * struct e.g. sockaddr_in6 or sockaddr_in. Failure to do so will
+ * break IPv6 hostname resolution (IPv4 will work only because
+ * the sockaddr_in struct happens to be of the correct size).
+ */
ret = getnameinfo (client_sockaddr,
- sizeof (*client_sockaddr),
+ addr_sz,
client_hostname, sizeof (client_hostname),
NULL, 0, 0);
if (ret) {
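The comment above is the crux of this hunk: getnameinfo() must be given the size of the concrete sockaddr_in/sockaddr_in6, not the size of a generic struct sockaddr. A minimal standalone sketch of the same pattern, for illustration only (ip_to_hostname is a hypothetical helper, not part of the patch):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <netdb.h>

    /* Resolve an IP string to a hostname, passing getnameinfo() the size of
     * the underlying address structure, as addr_sz does in the hunk above. */
    static int ip_to_hostname (const char *ip, char *host, socklen_t hostlen)
    {
            struct sockaddr_in  sa4 = { .sin_family  = AF_INET  };
            struct sockaddr_in6 sa6 = { .sin6_family = AF_INET6 };
            struct sockaddr    *sa  = NULL;
            socklen_t           salen = 0;

            if (inet_pton (AF_INET, ip, &sa4.sin_addr) == 1) {
                    sa    = (struct sockaddr *) &sa4;
                    salen = sizeof (sa4);   /* not sizeof (*sa) */
            } else if (inet_pton (AF_INET6, ip, &sa6.sin6_addr) == 1) {
                    sa    = (struct sockaddr *) &sa6;
                    salen = sizeof (sa6);
            } else {
                    return -1;
            }

            return getnameinfo (sa, salen, host, hostlen, NULL, 0, 0);
    }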
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index 51264237ab4..5e338f69528 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -642,7 +642,7 @@ gf_time_fmt (char *dst, size_t sz_dst, time_t utime, unsigned int fmt)
if (timefmt_last == (gf_timefmts) - 1)
_gf_timestuff (&timefmt_last, &fmts, &zeros);
if (timefmt_last < fmt) fmt = gf_timefmt_default;
- if (utime && gmtime_r (&utime, &tm) != NULL) {
+ if (utime && localtime_r (&utime, &tm) != NULL) {
strftime (dst, sz_dst, fmts[fmt], &tm);
} else {
strncpy (dst, "N/A", sz_dst);
diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h
index fbaac76b9ee..771ed983d32 100644
--- a/libglusterfs/src/compat.h
+++ b/libglusterfs/src/compat.h
@@ -479,6 +479,12 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);
#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0);
#endif
+#ifdef GF_BSD_HOST_OS
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+
+#ifndef IPV6_DEFAULT
+
#ifndef IXDR_GET_LONG
#define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf))
#endif
@@ -495,6 +501,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);
#define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG(buf, (long)(v))
#endif
+#endif /* IPV6_DEFAULT */
+
#if defined(__GNUC__) && !defined(RELAX_POISONING)
/* Use run API, see run.h */
#include <stdlib.h> /* system(), mkostemp() */
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c
index 25ddff0d8c4..6a61e641e19 100644
--- a/libglusterfs/src/dict.c
+++ b/libglusterfs/src/dict.c
@@ -27,6 +27,45 @@
#include "statedump.h"
#include "libglusterfs-messages.h"
+/* this goes with the bucket_size lookup table below */
+#define NUM_DISTINCT_SIZES_32_BIT 32
+
+/* this bucket_size lookup table is borrowed from GNU libstdc++ */
+static const uint32_t bucket_sizes[NUM_DISTINCT_SIZES_32_BIT] = {
+ /* 0 */ 5ul,
+ /* 1 */ 11ul,
+ /* 2 */ 23ul,
+ /* 3 */ 47ul,
+ /* 4 */ 97ul,
+ /* 5 */ 199ul,
+ /* 6 */ 409ul,
+ /* 7 */ 823ul,
+ /* 8 */ 1741ul,
+ /* 9 */ 3469ul,
+ /* 10 */ 6949ul,
+ /* 11 */ 14033ul,
+ /* 12 */ 28411ul,
+ /* 13 */ 57557ul,
+ /* 14 */ 116731ul,
+ /* 15 */ 236897ul,
+ /* 16 */ 480881ul,
+ /* 17 */ 976369ul,
+ /* 18 */ 1982627ul,
+ /* 19 */ 4026031ul,
+ /* 20 */ 8175383ul,
+ /* 21 */ 16601593ul,
+ /* 22 */ 33712729ul,
+ /* 23 */ 68460391ul,
+ /* 24 */ 139022417ul,
+ /* 25 */ 282312799ul,
+ /* 26 */ 573292817ul,
+ /* 27 */ 1164186217ul,
+ /* 28 */ 2364114217ul,
+ /* 29 */ 4294967291ul,
+ /* 30 */ 4294967291ul,
+ /* 31 */ 4294967291ul,
+};
+
struct dict_cmp {
dict_t *dict;
gf_boolean_t (*value_ignore) (char *k);
@@ -47,7 +86,7 @@ get_new_data ()
}
dict_t *
-get_new_dict_full (int size_hint)
+get_new_dict_full (uint32_t size_hint)
{
dict_t *dict = mem_get0 (THIS->ctx->dict_pool);
@@ -67,17 +106,8 @@ get_new_dict_full (int size_hint)
dict->members = &dict->members_internal;
}
else {
- /*
- * We actually need to allocate space for size_hint *pointers*
- * but we actually allocate space for one *structure*. Since
- * a data_pair_t consists of five pointers, we're wasting four
- * pointers' worth for N=1, and will overrun what we allocated
- * for N>5. If anybody ever starts using size_hint, we'll need
- * to fix this.
- */
- GF_ASSERT (size_hint <=
- (sizeof(data_pair_t) / sizeof(data_pair_t *)));
- dict->members = mem_get0 (THIS->ctx->dict_pair_pool);
+ dict->members = GF_CALLOC (size_hint, sizeof (data_pair_t *),
+ gf_common_mt_data_pair_t);
if (!dict->members) {
mem_put (dict);
return NULL;
@@ -108,6 +138,35 @@ dict_new (void)
return dict;
}
+dict_t *
+dict_new_by_size (uint32_t num)
+{
+ int32_t highest_bit = 0;
+ uint32_t bucket_size = 0;
+ dict_t *dict = NULL;
+
+ if (num == 0)
+ goto out;
+
+#ifdef _GNU_SOURCE
+ highest_bit = 32 - __builtin_clz (num);
+#else
+ while (num != 0) {
+ highest_bit++;
+ num >>= 1;
+ }
+#endif
+
+ bucket_size = bucket_sizes[highest_bit - 1];
+ dict = get_new_dict_full (bucket_size);
+
+ if (dict)
+ dict_ref (dict);
+
+out:
+ return dict;
+}
+
int32_t
is_data_equal (data_t *one,
data_t *two)
@@ -268,7 +327,7 @@ err_out:
static data_pair_t *
dict_lookup_common (dict_t *this, char *key)
{
- int hashval = 0;
+ uint32_t hashval = 0;
if (!this || !key) {
gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL,
LG_MSG_INVALID_ARG,
@@ -279,7 +338,7 @@ dict_lookup_common (dict_t *this, char *key)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1)
+ if (this->hash_size > 1)
hashval = SuperFastHash (key, strlen (key)) % this->hash_size;
data_pair_t *pair;
@@ -319,7 +378,7 @@ dict_lookup (dict_t *this, char *key, data_t **data)
static int32_t
dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)
{
- int hashval = 0;
+ uint32_t hashval = 0;
data_pair_t *pair;
char key_free = 0;
int tmp = 0;
@@ -336,7 +395,7 @@ dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1) {
+ if (this->hash_size > 1) {
tmp = SuperFastHash (key, strlen (key));
hashval = (tmp % this->hash_size);
}
@@ -478,7 +537,7 @@ dict_get (dict_t *this, char *key)
void
dict_del (dict_t *this, char *key)
{
- int hashval = 0;
+ uint32_t hashval = 0;
if (!this || !key) {
gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL,
@@ -491,7 +550,7 @@ dict_del (dict_t *this, char *key)
/* If the divisor is 1, the modulo is always 0,
* in such case avoid hash calculation.
*/
- if (this->hash_size != 1)
+ if (this->hash_size > 1)
hashval = SuperFastHash (key, strlen (key)) % this->hash_size;
data_pair_t *pair = this->members[hashval];
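dict_new_by_size() above turns the caller's size hint into a bucket count by taking the position of the hint's highest set bit and indexing the prime table with it. A self-contained sketch of that sizing rule, for illustration only (the shortened table and pick_bucket_size() are not part of the patch):

    #include <stdint.h>

    static const uint32_t sketch_bucket_sizes[] = {
            5, 11, 23, 47, 97, 199, 409, 823, 1741, 3469, 6949, 14033,
    };
    #define SKETCH_NUM_SIZES \
            (sizeof (sketch_bucket_sizes) / sizeof (sketch_bucket_sizes[0]))

    static uint32_t pick_bucket_size (uint32_t num)
    {
            uint32_t highest_bit = 0;

            if (num == 0)
                    return 1;

            while (num != 0) {      /* portable stand-in for __builtin_clz() */
                    highest_bit++;
                    num >>= 1;
            }

            if (highest_bit > SKETCH_NUM_SIZES)
                    highest_bit = SKETCH_NUM_SIZES;

            return sketch_bucket_sizes[highest_bit - 1];
    }

    /* pick_bucket_size (1) == 5, pick_bucket_size (100) == 409,
     * pick_bucket_size (1000) == 3469: the bucket count is always a few
     * times larger than the hint, so hash chains stay short. */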
diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h
index c5b82677e2e..5259c6befa1 100644
--- a/libglusterfs/src/dict.h
+++ b/libglusterfs/src/dict.h
@@ -79,9 +79,9 @@ struct _data_pair {
struct _dict {
unsigned char is_static:1;
- int32_t hash_size;
- int32_t count;
- int32_t refcount;
+ uint32_t hash_size;
+ uint32_t count;
+ uint32_t refcount;
data_pair_t **members;
data_pair_t *members_list;
char *extra_free;
@@ -156,9 +156,11 @@ void *data_to_ptr (data_t *data);
data_t *get_new_data ();
data_t * data_copy (data_t *old);
-dict_t *get_new_dict_full (int size_hint);
+dict_t *get_new_dict_full (uint32_t size_hint);
dict_t *get_new_dict ();
+#define dict_for_each(d, c) for (c = d->members_list; c; c = c->next)
+
int dict_foreach (dict_t *this,
int (*fn)(dict_t *this,
char *key,
@@ -196,6 +198,7 @@ int dict_keys_join (void *value, int size, dict_t *dict,
/* CLEANED UP FUNCTIONS DECLARATIONS */
GF_MUST_CHECK dict_t *dict_new (void);
+GF_MUST_CHECK dict_t *dict_new_by_size (uint32_t num);
dict_t *dict_copy_with_ref (dict_t *this, dict_t *new);
GF_MUST_CHECK int dict_reset (dict_t *dict);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 6e2d370605b..59f3df19420 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -78,6 +78,7 @@
#define ZR_STRICT_VOLFILE_CHECK "strict-volfile-check"
#define ZR_DUMP_FUSE "dump-fuse"
#define ZR_FUSE_MOUNTOPTS "fuse-mountopts"
+#define IO_THREADS_QUEUE_SIZE_KEY "io-thread-queue-size"
#define GF_XATTR_CLRLK_CMD "glusterfs.clrlk"
#define GF_XATTR_PATHINFO_KEY "trusted.glusterfs.pathinfo"
@@ -283,6 +284,51 @@
#define GF_LK_ADVISORY 0
#define GF_LK_MANDATORY 1
+#define GF_CHECK_XATTR_KEY_AND_GOTO(key, cmpkey, errval, lbl) \
+ do { \
+ if (key && strcmp (key, cmpkey) == 0) { \
+ errval = -EINVAL; \
+ goto lbl; \
+ } \
+ } while (0); \
+
+
+typedef enum {
+ GF_FOP_PRI_UNSPEC = -1, /* Priority not specified */
+ GF_FOP_PRI_HI = 0, /* low latency */
+ GF_FOP_PRI_NORMAL, /* normal */
+ GF_FOP_PRI_LO, /* bulk */
+ GF_FOP_PRI_LEAST, /* least */
+ GF_FOP_PRI_MAX,
+} gf_fop_pri_t;
+
+/* For backwards compatibility in io-threads */
+typedef gf_fop_pri_t iot_pri_t;
+#define IOT_PRI_UNSPEC GF_FOP_PRI_UNSPEC
+#define IOT_PRI_HI GF_FOP_PRI_HI
+#define IOT_PRI_NORMAL GF_FOP_PRI_NORMAL
+#define IOT_PRI_LO GF_FOP_PRI_LO
+#define IOT_PRI_LEAST GF_FOP_PRI_LEAST
+#define IOT_PRI_MAX GF_FOP_PRI_MAX
+
+static const char* FOP_PRI_STRINGS[] = {
+ "HIGH",
+ "NORMAL",
+ "LOW",
+ "LEAST"
+};
+
+static inline const char *fop_pri_to_string (gf_fop_pri_t pri)
+{
+ if (pri < 0)
+ return "UNSPEC";
+
+ if (pri >= GF_FOP_PRI_MAX)
+ return "INVALID";
+
+ return FOP_PRI_STRINGS[pri];
+}
+
const char *fop_enum_to_pri_string (glusterfs_fop_t fop);
const char *fop_enum_to_string (glusterfs_fop_t fop);
@@ -330,6 +376,7 @@ struct _cmd_args {
uint32_t log_buf_size;
uint32_t log_flush_timeout;
int32_t max_connect_attempts;
+ unsigned int connect_attempts;
char *print_exports;
char *print_netgroups;
/* advanced options */
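The hunk above generalizes the io-threads priority enum into gf_fop_pri_t and adds fop_pri_to_string() for logging. A small usage sketch, for illustration only (log_queue_priority() is hypothetical and assumes glusterfs.h is included):

    #include <stdio.h>

    static void log_queue_priority (gf_fop_pri_t pri)
    {
            /* GF_FOP_PRI_LO prints "LOW"; GF_FOP_PRI_UNSPEC prints "UNSPEC" */
            printf ("queued at %s priority\n", fop_pri_to_string (pri));
    }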
diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c
index 17cd68fc206..fa3ac840c43 100644
--- a/libglusterfs/src/iobuf.c
+++ b/libglusterfs/src/iobuf.c
@@ -30,8 +30,8 @@ struct iobuf_init_config gf_iobuf_init_config[] = {
{8 * 1024, 128},
{32 * 1024, 64},
{128 * 1024, 32},
- {256 * 1024, 8},
- {1 * 1024 * 1024, 2},
+ {256 * 1024, 64},
+ {1 * 1024 * 1024, 64},
};
int
diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c
index 611615949fa..d51e64768aa 100644
--- a/libglusterfs/src/latency.c
+++ b/libglusterfs/src/latency.c
@@ -21,6 +21,7 @@
#include "statedump.h"
#include "libglusterfs-messages.h"
+static int gf_set_fop_from_fn_pointer_warning;
void
gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn)
{
@@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void
fop = GF_FOP_READDIRP;
else if (fops->getspec == *(fop_getspec_t *)&fn)
fop = GF_FOP_GETSPEC;
- else
- fop = -1;
+ else if (fops->ipc == *(fop_ipc_t *)&fn)
+ fop = GF_FOP_IPC;
+ else {
+ fop = GF_FOP_NULL;
+ GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning,
+ "latency",
+ GF_LOG_WARNING,
+ "Unknown FOP type");
+ }
frame->op = fop;
}
@@ -129,6 +137,13 @@ gf_update_latency (call_frame_t *frame)
elapsed = (end->tv_sec - begin->tv_sec) * 1e6
+ (end->tv_usec - begin->tv_usec);
+ if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) {
+ gf_log ("[core]", GF_LOG_WARNING,
+ "Invalid frame op value: %d",
+ frame->op);
+ return;
+ }
+
lat = &frame->this->latencies[frame->op];
lat->total += elapsed;
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index 88fbdf58319..d189be7960e 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -454,6 +454,10 @@ mem_get0 (struct mem_pool *mem_pool)
void *
mem_get (struct mem_pool *mem_pool)
{
+#ifdef DISABLE_MEMPOOL
+ return GF_CALLOC (1, mem_pool->real_sizeof_type,
+ gf_common_mt_mem_pool);
+#else
struct list_head *list = NULL;
void *ptr = NULL;
int *in_use = NULL;
@@ -525,9 +529,11 @@ fwd_addr_out:
UNLOCK (&mem_pool->lock);
return ptr;
+#endif /* DISABLE_MEMPOOL */
}
+#ifndef DISABLE_MEMPOOL
static int
__is_member (struct mem_pool *pool, void *ptr)
{
@@ -546,11 +552,16 @@ __is_member (struct mem_pool *pool, void *ptr)
return 1;
}
+#endif
void
mem_put (void *ptr)
{
+#ifdef DISABLE_MEMPOOL
+ GF_FREE (ptr);
+ return;
+#else
struct list_head *list = NULL;
int *in_use = NULL;
void *head = NULL;
@@ -628,6 +639,7 @@ mem_put (void *ptr)
}
}
UNLOCK (&pool->lock);
+#endif /* DISABLE_MEMPOOL */
}
void
diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h
index afa52d8bc45..fc7bf9e5996 100644
--- a/libglusterfs/src/mem-types.h
+++ b/libglusterfs/src/mem-types.h
@@ -168,6 +168,7 @@ enum gf_common_mem_types_ {
/*lock migration*/
gf_common_mt_lock_mig,
gf_common_mt_pthread_t,
+ gf_common_ping_local_t,
gf_common_mt_end
};
#endif
diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c
index f7b2bea2f30..903303d1380 100644
--- a/libglusterfs/src/timespec.c
+++ b/libglusterfs/src/timespec.c
@@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta)
ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000);
ts->tv_sec += delta.tv_sec;
}
+
+void timespec_sub (const struct timespec *begin, const struct timespec *end,
+ struct timespec *res)
+{
+ if (end->tv_nsec < begin->tv_nsec) {
+ res->tv_sec = end->tv_sec - begin->tv_sec - 1;
+ res->tv_nsec = end->tv_nsec + 1000000000 - begin->tv_nsec;
+ } else {
+ res->tv_sec = end->tv_sec - begin->tv_sec;
+ res->tv_nsec = end->tv_nsec - begin->tv_nsec;
+ }
+}
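timespec_sub() above is the usual subtract-with-borrow on the nanosecond field. A short usage sketch, for illustration only (assumes the libglusterfs timespec.h header; do_work() is a hypothetical function being timed), mirroring the microsecond conversion the ping callback performs later in this patch:

    #include <stdint.h>
    #include "timespec.h"

    static int64_t time_work_usec (void)
    {
            struct timespec begin, end, delta;

            timespec_now (&begin);
            do_work ();                 /* hypothetical work being timed */
            timespec_now (&end);

            timespec_sub (&begin, &end, &delta);
            return delta.tv_sec * 1000000LL + delta.tv_nsec / 1000LL;
    }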
diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h
index f37194b97cf..9c393ee7166 100644
--- a/libglusterfs/src/timespec.h
+++ b/libglusterfs/src/timespec.h
@@ -20,5 +20,8 @@
void timespec_now (struct timespec *ts);
void timespec_adjust_delta (struct timespec *ts, struct timespec delta);
+void timespec_sub (const struct timespec *begin,
+ const struct timespec *end,
+ struct timespec *res);
#endif /* __INCLUDE_TIMESPEC_H__ */
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index 3c1cde50fa0..b2529d3c4f7 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -117,6 +117,14 @@ out:
}
+static const char *xlator_lib_path (void)
+{
+ const char *libdir_env = getenv ("GLUSTER_LIBDIR");
+
+ return libdir_env ? libdir_env : XLATORDIR;
+}
+
+
int
xlator_volopt_dynload (char *xlator_type, void **dl_handle,
volume_opt_list_t *opt_list)
@@ -130,9 +138,11 @@ xlator_volopt_dynload (char *xlator_type, void **dl_handle,
/* socket.so doesn't fall under the default xlator directory, hence we
* need this check */
if (!strstr(xlator_type, "rpc-transport"))
- ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xlator_type);
+ ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (),
+ xlator_type);
else
- ret = gf_asprintf (&name, "%s/%s.so", XLATORPARENTDIR, xlator_type);
+ ret = gf_asprintf (&name, "%s/../%s.so", xlator_lib_path (),
+ xlator_type);
if (-1 == ret) {
goto out;
}
@@ -183,7 +193,7 @@ xlator_dynload (xlator_t *xl)
INIT_LIST_HEAD (&xl->volume_options);
- ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xl->type);
+ ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), xl->type);
if (-1 == ret) {
goto out;
}
diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h
index 70e6f0a108d..2e04893c487 100644
--- a/libglusterfs/src/xlator.h
+++ b/libglusterfs/src/xlator.h
@@ -927,6 +927,7 @@ struct _xlator {
gf_loglevel_t loglevel; /* Log level for translator */
+ fop_latency_t client_latency;
/* for latency measurement */
fop_latency_t latencies[GF_FOP_MAXVALUE];
diff --git a/rfc.sh b/rfc.sh
index 998918ef04e..f6f0e4f3a12 100755
--- a/rfc.sh
+++ b/rfc.sh
@@ -17,7 +17,7 @@ done
shift $((OPTIND-1))
-branch="release-3.8";
+branch="release-3.8-fb";
set_hooks_commit_msg()
{
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c
index a7ff866ac99..7ce066dec5f 100644
--- a/rpc/rpc-lib/src/rpc-clnt-ping.c
+++ b/rpc/rpc-lib/src/rpc-clnt-ping.c
@@ -18,6 +18,7 @@
#include "mem-pool.h"
#include "xdr-rpc.h"
#include "rpc-common-xdr.h"
+#include "timespec.h"
char *clnt_ping_procs[GF_DUMP_MAXVALUE] = {
@@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = {
.procnames = clnt_ping_procs,
};
+struct ping_local {
+ struct rpc_clnt *rpc;
+ struct timespec submit_time;
+};
+
/* Must be called under conn->lock */
static int
__rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk)
@@ -166,16 +172,48 @@ out:
return;
}
+void
+_update_client_latency (const rpc_clnt_connection_t *conn,
+ call_frame_t *frame,
+ uint64_t elapsed_usec)
+{
+ fop_latency_t *lat;
+
+ lat = &frame->this->client_latency;
+
+ if (elapsed_usec < lat->min) {
+ lat->min = elapsed_usec;
+ }
+
+ if (elapsed_usec > lat->max) {
+ lat->max = elapsed_usec;
+ }
+
+ lat->total += elapsed_usec;
+ lat->count++;
+ lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count;
+ gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, "
+ "avg: %0.6lf ms, count:%ld",
+ conn->trans->peerinfo.identifier, elapsed_usec / 1000.0,
+ lat->mean / 1000.0, lat->count);
+}
+
int
rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
void *myframe)
{
- struct rpc_clnt *rpc = NULL;
+ struct ping_local *local = NULL;
xlator_t *this = NULL;
rpc_clnt_connection_t *conn = NULL;
+
call_frame_t *frame = NULL;
struct timespec timeout = {0, };
+ struct timespec now;
+ struct timespec delta;
+ int64_t latency_usec = 0;
+ int ret = 0;
int unref = 0;
+ gf_boolean_t call_notify = _gf_false;
if (!myframe) {
gf_log (THIS->name, GF_LOG_WARNING,
@@ -185,14 +223,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
frame = myframe;
this = frame->this;
- rpc = frame->local;
- frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */
- conn = &rpc->conn;
+ local = frame->local;
+ conn = &local->rpc->conn;
pthread_mutex_lock (&conn->lock);
{
if (req->rpc_status == -1) {
- unref = rpc_clnt_remove_ping_timer_locked (rpc);
+ unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
if (unref) {
gf_log (this->name, GF_LOG_WARNING,
"socket or ib related error");
@@ -207,8 +244,15 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
goto unlock;
}
- unref = rpc_clnt_remove_ping_timer_locked (rpc);
- if (__rpc_clnt_rearm_ping_timer (rpc,
+ timespec_now (&now);
+ timespec_sub (&local->submit_time, &now, &delta);
+ latency_usec = delta.tv_sec * 1000000UL +
+ delta.tv_nsec / 1000UL;
+
+ _update_client_latency (conn, frame, latency_usec);
+ call_notify = _gf_true;
+ unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
+ if (__rpc_clnt_rearm_ping_timer (local->rpc,
rpc_clnt_start_ping) == -1) {
gf_log (this->name, GF_LOG_WARNING,
"failed to set the ping timer");
@@ -217,12 +261,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
}
unlock:
pthread_mutex_unlock (&conn->lock);
+
+ if (call_notify) {
+ ret = local->rpc->notifyfn (local->rpc, this,
+ RPC_CLNT_PING, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "RPC_CLNT_PING notify failed");
+ }
+ }
out:
if (unref)
- rpc_clnt_unref (rpc);
+ rpc_clnt_unref (local->rpc);
- if (frame)
+ if (frame) {
+ GF_FREE (frame->local);
+ frame->local = NULL;
STACK_DESTROY (frame->root);
+ }
return 0;
}
@@ -232,18 +288,27 @@ rpc_clnt_ping (struct rpc_clnt *rpc)
call_frame_t *frame = NULL;
int32_t ret = -1;
rpc_clnt_connection_t *conn = NULL;
+ struct ping_local *local = NULL;
conn = &rpc->conn;
+ local = GF_MALLOC (sizeof(struct ping_local), gf_common_ping_local_t);
+ if (!local)
+ return ret;
frame = create_frame (THIS, THIS->ctx->pool);
- if (!frame)
+ if (!frame) {
+ GF_FREE (local);
return ret;
+ }
- frame->local = rpc;
+ local->rpc = rpc;
+ timespec_now (&local->submit_time);
+ frame->local = local;
ret = rpc_clnt_submit (rpc, &clnt_ping_prog,
GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0,
NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL);
if (ret) {
+ /* FIXME: should we free the frame here? Methinks so! */
gf_log (THIS->name, GF_LOG_ERROR,
"failed to start ping timer");
}
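_update_client_latency() above keeps a running mean with the standard incremental formula mean += (sample - mean) / count, so no per-sample history is needed. A tiny self-contained sketch with worked numbers, for illustration only (update_mean() is not part of the patch):

    #include <stdint.h>

    /* After samples 100, 200, 600 usec:
     *   count=1: mean = 0   + (100 - 0)   / 1 = 100
     *   count=2: mean = 100 + (200 - 100) / 2 = 150
     *   count=3: mean = 150 + (600 - 150) / 3 = 300
     * which equals (100 + 200 + 600) / 3 without storing every sample. */
    static void update_mean (double *mean, uint64_t *count, double sample)
    {
            *count += 1;
            *mean  += (sample - *mean) / (double) *count;
    }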
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.h b/rpc/rpc-lib/src/rpc-clnt-ping.h
index d92e5054190..7354679d50f 100644
--- a/rpc/rpc-lib/src/rpc-clnt-ping.h
+++ b/rpc/rpc-lib/src/rpc-clnt-ping.h
@@ -9,7 +9,7 @@
*/
-#define RPC_DEFAULT_PING_TIMEOUT 30
+#define RPC_DEFAULT_PING_TIMEOUT 300
void
rpc_clnt_check_and_start_ping (struct rpc_clnt *rpc_ptr);
int
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index d3df5560a8b..406efdb2d4f 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -21,6 +21,8 @@
#include "xdr-rpc.h"
#include "rpc-common-xdr.h"
+#pragma GCC diagnostic ignored "-Wformat="
+
void
rpc_clnt_reply_deinit (struct rpc_req *req, struct mem_pool *pool);
@@ -122,6 +124,7 @@ call_bail (void *data)
struct iovec iov = {0,};
char peerid[UNIX_PATH_MAX] = {0};
gf_boolean_t need_unref = _gf_false;
+ gf_boolean_t timedout_frames = _gf_false;
GF_VALIDATE_OR_GOTO ("client", data, out);
@@ -198,7 +201,6 @@ call_bail (void *data)
"--",
trav->rpcreq->procnum, trav->rpcreq->xid, frame_sent,
conn->frame_timeout, peerid);
-
clnt = rpc_clnt_ref (clnt);
trav->rpcreq->rpc_status = -1;
trav->rpcreq->cbkfn (trav->rpcreq, &iov, 1, trav->frame);
@@ -207,7 +209,30 @@ call_bail (void *data)
clnt = rpc_clnt_unref (clnt);
list_del_init (&trav->list);
mem_put (trav);
- }
+ timedout_frames = _gf_true;
+ }
+ /* So what on earth is this you ask? It was observed while testing
+ * the SHD threading code, that under high loads SHD/AFR related
+ * SyncOps & SyncTasks can actually hang/deadlock as the transport
+ * disconnected event never gets bubbled up correctly. Various
+ * tests indicated the ping timeouts worked fine, while "frame timeouts"
+ * did not. The only difference? Ping timeouts actually disconnect
+ * the transport while frame timeouts did not. So from a high-level we
+ * know this prevents deadlock as subsequent tests showed the deadlocks
+ * no longer occurred (after this change). That said, there may be some
+ * more elegant solution. For now though, forcing a reconnect is
+ * preferential vs hanging clients or deadlocking the SHD.
+ *
+ * I suspect the culprit might be in
+ * afr-self-heal-common.c:afr_sh_common_lookup_cbk as this function
+ * will early-return if the callcount never actually reaches 0,
+ * which ordinarily is fine (you only want your callback called if
+ * the Nth response is received), but what happens if callcount
+ * never reaches 0? The callback won't be called. Theory at this
+ * point, but a good spot to start when we get a chance.
+ */
+ if (timedout_frames)
+ rpc_transport_disconnect (clnt->conn.trans);
out:
rpc_clnt_unref (clnt);
if (need_unref)
diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h
index df19a0c403f..5ad4fd42298 100644
--- a/rpc/rpc-lib/src/rpc-clnt.h
+++ b/rpc/rpc-lib/src/rpc-clnt.h
@@ -19,6 +19,7 @@
typedef enum {
RPC_CLNT_CONNECT,
RPC_CLNT_DISCONNECT,
+ RPC_CLNT_PING,
RPC_CLNT_MSG,
RPC_CLNT_DESTROY
} rpc_clnt_event_t;
diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c
index e224dcc022e..5556740ca81 100644
--- a/rpc/rpc-lib/src/rpc-transport.c
+++ b/rpc/rpc-lib/src/rpc-transport.c
@@ -166,6 +166,19 @@ out:
+int rpc_transport_lib_path (char **name, char *type)
+{
+ int ret = -1;
+ char *libdir_env = getenv ("GLUSTER_LIBDIR");
+
+ ret = libdir_env == NULL
+ ? gf_asprintf (name, "%s/%s.so", RPC_TRANSPORTDIR, type)
+ : gf_asprintf (name, "%s/rpc-transport/%s.so", libdir_env, type);
+ return ret;
+}
+
+
+
rpc_transport_t *
rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)
{
@@ -274,7 +287,7 @@ rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)
goto fail;
}
- ret = gf_asprintf (&name, "%s/%s.so", RPC_TRANSPORTDIR, type);
+ ret = rpc_transport_lib_path (&name, type);
if (-1 == ret) {
goto fail;
}
@@ -652,18 +665,37 @@ out:
return ret;
}
+/** @brief build a dictionary containing basic transport options.
+ *
+ * @param[out] options: will be set to a newly created dictionary on success.
+ * @param[in] hostname: desired target hostname.
+ * @param[in] port: desired target port.
+ * @param[in] addr_family (optional): desired address family. If NULL,
+ * default will be used.
+ *
+ * @returns zero on success.
+ */
int
rpc_transport_inet_options_build (dict_t **options, const char *hostname,
- int port)
+ int port, const char *addr_family)
{
dict_t *dict = NULL;
char *host = NULL;
int ret = -1;
+#ifdef IPV6_DEFAULT
+ const char *addr_family_default = "inet6";
+#else
+ const char *addr_family_default = "inet";
+#endif
GF_ASSERT (options);
GF_ASSERT (hostname);
GF_ASSERT (port >= 1024);
+ if (!addr_family) {
+ addr_family = addr_family_default;
+ }
+
dict = dict_new ();
if (!dict)
goto out;
@@ -688,6 +720,14 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname,
goto out;
}
+ ret = dict_set_str (dict, "transport.address-family",
+ (char *)addr_family);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set address-family to %s", addr_family);
+ goto out;
+ }
+
ret = dict_set_str (dict, "transport-type", "socket");
if (ret) {
gf_log (THIS->name, GF_LOG_WARNING,
diff --git a/rpc/rpc-lib/src/rpc-transport.h b/rpc/rpc-lib/src/rpc-transport.h
index f0add065065..0f555462ea4 100644
--- a/rpc/rpc-lib/src/rpc-transport.h
+++ b/rpc/rpc-lib/src/rpc-transport.h
@@ -311,5 +311,6 @@ rpc_transport_unix_options_build (dict_t **options, char *filepath,
int frame_timeout);
int
-rpc_transport_inet_options_build (dict_t **options, const char *hostname, int port);
+rpc_transport_inet_options_build (dict_t **options, const char *hostname,
+ int port, const char *addr_family);
#endif /* __RPC_TRANSPORT_H__ */
diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c
index 5a5c65114c4..bc661043674 100644
--- a/rpc/rpc-lib/src/rpcsvc.c
+++ b/rpc/rpc-lib/src/rpcsvc.c
@@ -37,9 +37,15 @@
#include <stdarg.h>
#include <stdio.h>
+#ifdef IPV6_DEFAULT
+#include <netconfig.h>
+#endif
+
#include "xdr-rpcclnt.h"
#include "glusterfs-acl.h"
+#pragma GCC diagnostic ignored "-Wformat="
+
struct rpcsvc_program gluster_dump_prog;
#define rpcsvc_alloc_request(svc, request) \
@@ -1392,6 +1398,90 @@ rpcsvc_error_reply (rpcsvc_request_t *req)
return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL);
}
+#ifdef IPV6_DEFAULT
+int
+rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port, gf_boolean_t unregister_first)
+{
+ const int IP_BUF_LEN = 64;
+ char addr_buf[IP_BUF_LEN];
+
+ int err = 0;
+ bool_t success = 0;
+ struct netconfig *nc;
+ struct netbuf *nb;
+
+ if (!newprog) {
+ goto out;
+ }
+
+ nc = getnetconfigent ("tcp6");
+ if (!nc) {
+ err = -1;
+ goto out;
+ }
+
+
+ err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff,
+ port & 0xff);
+ if (err < 0) {
+ err = -1;
+ goto out;
+ }
+
+ nb = uaddr2taddr (nc, addr_buf);
+ if (!nb) {
+ err = -1;
+ goto out;
+ }
+
+ if (unregister_first) {
+ /* Force the unregistration of the program first.
+ * This call may fail if nothing has been registered,
+ * which is fine.
+ */
+ rpcsvc_program_unregister_rpcbind6 (newprog);
+ }
+
+ success = rpcb_set (newprog->prognum, newprog->progver, nc, nb);
+ if (!success) {
+ gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6"
+ " service with rpcbind");
+ }
+
+ err = 0;
+
+out:
+ return err;
+}
+
+int
+rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog)
+{
+ int err = 0;
+ bool_t success = 0;
+ struct netconfig *nc;
+
+ if (!newprog) {
+ goto out;
+ }
+
+ nc = getnetconfigent ("tcp6");
+ if (!nc) {
+ err = -1;
+ goto out;
+ }
+
+ success = rpcb_unset (newprog->prognum, newprog->progver, nc);
+ if (!success) {
+ gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6"
+ " service with rpcbind");
+ }
+
+ err = 0;
+out:
+ return err;
+}
+#endif
/* Register the program with the local portmapper service. */
int
@@ -1556,7 +1646,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program)
" program failed");
goto out;
}
-
+#ifdef IPV6_DEFAULT
+ ret = rpcsvc_program_unregister_rpcbind6 (program);
+ if (ret == -1) {
+ gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)"
+ " unregistration of program failed");
+ goto out;
+ }
+#endif
pthread_mutex_lock (&svc->rpclock);
{
list_for_each_entry (prog, &svc->programs, program) {
diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h
index 08402373be6..17e72482531 100644
--- a/rpc/rpc-lib/src/rpcsvc.h
+++ b/rpc/rpc-lib/src/rpcsvc.h
@@ -437,6 +437,13 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener);
extern int
rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port);
+#ifdef IPV6_DEFAULT
+extern int
+rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port, gf_boolean_t unregister_first);
+extern int
+rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog);
+#endif
+
extern int
rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog);
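As background for the rpcbind6 registration above: an rpcbind "universal address" encodes the port as two extra decimal octets appended to the address string, which is why the code formats "::.%d.%d" from the high and low bytes of the port. A stand-alone sketch of that encoding (the port is just an example):

/* Illustrative only: build the rpcbind universal address for the
 * IPv6 any-address and port 2049 (NFS); prints "::.8.1". */
#include <stdio.h>

int
main (void)
{
        char uaddr[64];
        int  port = 2049;

        snprintf (uaddr, sizeof (uaddr), "::.%d.%d",
                  (port >> 8) & 0xff, port & 0xff);
        printf ("%s\n", uaddr);
        return 0;
}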
diff --git a/rpc/rpc-lib/src/xdr-common.h b/rpc/rpc-lib/src/xdr-common.h
index 596ac99640f..211e33272ba 100644
--- a/rpc/rpc-lib/src/xdr-common.h
+++ b/rpc/rpc-lib/src/xdr-common.h
@@ -105,4 +105,11 @@ unsigned long xdr_sizeof (xdrproc_t func, void *data);
#define xdr_decoded_length(xdr) (((size_t)(&xdr)->x_private) - ((size_t)(&xdr)->x_base))
+/*
+ * The TIRPC headers rather annoyingly don't declare this, even though it
+ * actually exists in the library.
+ */
+extern u_long xdr_sizeof (xdrproc_t freebsd_compiler_is_broken,
+ void *so_is_net_bsd);
+
#endif
diff --git a/rpc/rpc-transport/rdma/src/name.c b/rpc/rpc-transport/rdma/src/name.c
index 8003b1c87a0..b9d3269eb73 100644
--- a/rpc/rpc-transport/rdma/src/name.c
+++ b/rpc/rpc-transport/rdma/src/name.c
@@ -54,6 +54,10 @@ af_inet_bind_to_port_lt_ceiling (struct rdma_cm_id *cm_id,
struct sockaddr *sockaddr,
socklen_t sockaddr_len, uint32_t ceiling)
{
+#if defined(NO_PRIVPORT)
+ _assign_port(sockaddr, 0);
+ return rdma_bind_addr (cm_id, sockaddr);
+#else
int32_t ret = -1;
uint16_t port = ceiling - 1;
gf_boolean_t ports[GF_PORT_MAX];
@@ -100,6 +104,7 @@ loop:
}
return ret;
+#endif /* NO_PRIVPORT */
}
#if 0
diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c
index 0e34dc211fe..cab4161c076 100644
--- a/rpc/rpc-transport/socket/src/name.c
+++ b/rpc/rpc-transport/socket/src/name.c
@@ -42,6 +42,10 @@ static int32_t
af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr,
socklen_t sockaddr_len, uint32_t ceiling)
{
+#if defined(NO_PRIVPORT)
+ _assign_port(sockaddr, 0);
+ return bind (fd, sockaddr, sockaddr_len);
+#else
int32_t ret = -1;
uint16_t port = ceiling - 1;
gf_boolean_t ports[GF_PORT_MAX];
@@ -88,6 +92,7 @@ loop:
}
return ret;
+#endif /* NO_PRIVPORT */
}
static int32_t
@@ -557,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)
data_t *address_family_data = NULL;
int32_t ret = -1;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+ sa_family_t default_family = AF_INET6;
+#else
+ char *addr_family = "inet";
+ sa_family_t default_family = AF_INET;
+#endif
+
GF_VALIDATE_OR_GOTO ("socket", sa_family, out);
address_family_data = dict_get (this->options,
@@ -581,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)
}
} else {
gf_log (this->name, GF_LOG_DEBUG,
- "option address-family not specified, defaulting to inet");
- *sa_family = AF_INET;
+ "option address-family not specified, "
+ "defaulting to %s", addr_family);
+ *sa_family = default_family;
}
ret = 0;
diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c
index ae551dcfae7..40a25bdba83 100644
--- a/rpc/rpc-transport/socket/src/socket.c
+++ b/rpc/rpc-transport/socket/src/socket.c
@@ -38,6 +38,7 @@
#include <errno.h>
#include <rpc/xdr.h>
#include <sys/ioctl.h>
+
#define GF_LOG_ERRNO(errno) ((errno == ENOTCONN) ? GF_LOG_DEBUG : GF_LOG_ERROR)
#define SA(ptr) ((struct sockaddr *)ptr)
@@ -55,7 +56,11 @@
/* TBD: do automake substitutions etc. (ick) to set these. */
#if !defined(DEFAULT_ETC_SSL)
# ifdef GF_LINUX_HOST_OS
+# ifdef GF_FBEXTRAS
+# define DEFAULT_ETC_SSL "/var/lib/glusterd/ssl"
+# else
# define DEFAULT_ETC_SSL "/etc/ssl"
+# endif
# endif
# ifdef GF_BSD_HOST_OS
# define DEFAULT_ETC_SSL "/etc/openssl"
@@ -866,7 +871,7 @@ __socket_keepalive (int fd, int family, int keepalive_intvl,
goto err;
}
#else
- if (family != AF_INET)
+ if (family != AF_INET && family != AF_INET6)
goto done;
ret = setsockopt (fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_idle,
@@ -3009,6 +3014,21 @@ socket_connect (rpc_transport_t *this, int port)
}
}
+ /* Make sure we are not vulnerable to someone setting
+ * net.ipv6.bindv6only to 1, so that gluster services stay
+ * available over both IPv4 & IPv6.
+ */
+ int disable_v6only = 0;
+
+ if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY,
+ (void *)&disable_v6only,
+ sizeof (disable_v6only)) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Error disabling sockopt IPV6_V6ONLY: \"%s\"",
+ strerror (errno));
+ }
+
+
if (priv->nodelay && (sa_family != AF_UNIX)) {
ret = __socket_nodelay (priv->sock);
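A stand-alone sketch of the dual-stack behaviour the hunk above relies on: clearing IPV6_V6ONLY on an AF_INET6 socket lets IPv4 peers reach it through v4-mapped addresses (::ffff:a.b.c.d). The port is illustrative and error handling is minimal:

/* Illustrative only; not part of the patch. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int
main (void)
{
        int fd  = socket (AF_INET6, SOCK_STREAM, 0);
        int off = 0;
        struct sockaddr_in6 addr;

        if (fd < 0)
                return 1;
        /* 0 == accept both IPv6 and v4-mapped IPv4 connections. */
        if (setsockopt (fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof (off)) < 0)
                perror ("IPV6_V6ONLY");

        memset (&addr, 0, sizeof (addr));
        addr.sin6_family = AF_INET6;
        addr.sin6_addr   = in6addr_any;
        addr.sin6_port   = htons (24007);      /* example port only */

        if (bind (fd, (struct sockaddr *)&addr, sizeof (addr)) == 0)
                listen (fd, 16);
        close (fd);
        return 0;
}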
diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x
index 8462dcc258a..5ec8109d828 100644
--- a/rpc/xdr/src/glusterfs-fops.x
+++ b/rpc/xdr/src/glusterfs-fops.x
@@ -84,6 +84,7 @@ enum glusterfs_event_t {
GF_EVENT_UPCALL,
GF_EVENT_SCRUB_STATUS,
GF_EVENT_SOME_CHILD_DOWN,
+ GF_EVENT_CHILD_PING,
GF_EVENT_MAXVAL
};
diff --git a/run-tests.sh b/run-tests.sh
index 1487f30d832..866ab0464b4 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -182,12 +182,14 @@ function get_test_status ()
# for later. Why does the key have the distro and version then?
# Because changing the key in all test files would be a very big process;
# updating just this function with better logic is much simpler.
+ #
+ # FB Edit: For FB tests we are disabling NetBSD testing.
+ #
Linux)
result=$(grep -e "^#G_TESTDEF_TEST_STATUS_CENTOS6" $test_name | \
awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;;
NetBSD)
- result=$(grep -e "^#G_TESTDEF_TEST_STATUS_NETBSD7" $test_name | \
- awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;;
+ result="KNOWN_ISSUE" ;;
*)
result="ENABLED" ;;
esac
diff --git a/site.h.in b/site.h.in
new file mode 100644
index 00000000000..d917d78e59b
--- /dev/null
+++ b/site.h.in
@@ -0,0 +1,27 @@
+/*
+ * Guidelines for using this file vs. configure.ac
+ *
+ * (1) If it already exists in configure.ac, leave it there.
+ *
+ * (2) If it needs to take effect at configure (not compile) time, it *needs*
+ * to go in configure.ac.
+ *
+ * (3) If it affects file paths, which are the things most likely to be based
+ * on an OS or distribution's generic filesystem hierarchy and not on a
+ * particular package's definition (e.g. an RPM specfile), it should probably
+ * go in configure.ac.
+ *
+ * (4) If it affects default sizes, limits, thresholds, or modes of operation
+ * (e.g. IPv4 vs. IPv6), it should probably go here.
+ *
+ * (5) For anything else, is it more like the things in 3 or the things in 4?
+ * Which approach is more convenient for the people who are likely to use the
+ * new option(s)? Make your best guesses, confirm with others, and go with
+ * what works.
+ */
+
+/*
+ * This is just an example, and a way to check whether site.h is actually being
+ * included automatically.
+ */
+#define SITE_DOT_H_TEST 9987
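A hypothetical illustration of the rule-(4) style of entry this header is meant to hold; the macro name is made up for the example and does not appear in this patch:

/* Example only: a compile-time default that a site could override
 * at build time without touching configure.ac. */
#ifndef SITE_EXAMPLE_DEFAULT_ADDR_FAMILY
#define SITE_EXAMPLE_DEFAULT_ADDR_FAMILY "inet6"
#endif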
diff --git a/test_env b/test_env
new file mode 100644
index 00000000000..2e6c33c9e6a
--- /dev/null
+++ b/test_env
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+DESIRED_TESTS="\
+ tests/basic/*.t\
+ tests/basic/afr/*.t\
+ tests/basic/distribute/*.t\
+ tests/features/brick-min-free-space.t\
+"
+
+KNOWN_FLAKY_TESTS="\
+ tests/bugs/glusterd/bug-1173414-mgmt-v3-remote-lock-failure.t\
+ tests/bugs/glusterd/bug-1420637-volume-sync-fix.t\
+ tests/bugs/glusterd/bug-1104642.t\
+ tests/bugs/glusterd/bug-1022055.t\
+ tests/bugs/glusterd/bug-1293414-import-brickinfo-uuid.t\
+ tests/bugs/transport/bug-873367.t\
+ tests/bugs/ec/bug-1161621.t\
+ tests/bugs/quota/bug-1287996.t\
+ tests/bugs/fb8149516.t\
+ tests/bugs/posix/bug-990028.t\
+ tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t\
+ tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t\
+ tests/bugs/write-behind/bug-1279730.t\
+ tests/bugs/cli/bug-1320388.t\
+ tests/bugs/snapshot/bug-1112613.t\
+ tests/bugs/snapshot/bug-1087203.t\
+ tests/bugs/snapshot/bug-1202436-calculate-quota-cksum-during-snap-restore.t\
+ tests/bugs/snapshot/bug-1205592.t\
+ tests/bugs/snapshot/bug-1140162-file-snapshot-features-encrypt-opts-validation.t\
+ tests/bugs/glusterd/bug-1231437-rebalance-test-in-cluster.t\
+ tests/bugs/snapshot/bug-1049834.t\
+ tests/bugs/shard/zero-flag.t\
+ tests/bugs/bitrot/1207029-bitrot-daemon-should-start-on-valid-node.t\
+ tests/bugs/bitrot/1209752-volume-status-should-show-bitrot-scrub-info.t\
+ tests/bugs/snapshot/bug-1399598-uss-with-ssl.t\
+ tests/bugs/tier/bug-1279376-rename-demoted-file.t\
+ tests/bugs/tier/bug-1286974.t\
+ tests/bugs/tier/bug-1205545-CTR-and-trash-integration.t\
+ tests/features/ipc.t\
+ tests/features/ssl-authz.t\
+ tests/bugs/glusterd/bug-948686.t\
+ tests/bugs/core/bug-986429.t\
+ tests/bugs/fb4482137.t\
+ tests/bugs/glusterd/bug-913555.t\
+ tests/basic/rpm.t\
+ tests/basic/accept-v6v4.t\
+ tests/basic/afr/granular-esh/granular-esh.t\
+ tests/basic/afr/granular-esh/cli.t\
+ tests/basic/afr/granular-esh/granular-indices-but-non-granular-heal.t\
+ tests/basic/afr/granular-esh/conservative-merge.t\
+ tests/basic/afr/granular-esh/add-brick.t\
+ tests/basic/afr/granular-esh/replace-brick.t\
+ tests/basic/bd.t tests/basic/uss.t\
+ tests/basic/glusterd/arbiter-volume-probe.t\
+ tests/basic/meta.t\
+ tests/basic/gfapi/bug1291259.t\
+ tests/basic/gfapi/gfapi-ssl-test.t\
+ tests/basic/tier/bug-1214222-directories_missing_after_attach_tier.t\
+ tests/basic/tier/record-metadata-heat.t\
+ tests/basic/tier/readdir-during-migration.t\
+ tests/basic/fops-sanity-gfproxy.t\
+ tests/basic/tier/ctr-rename-overwrite.t\
+ tests/basic/tier/frequency-counters.t\
+ tests/basic/mgmt_v3-locks.t\
+ tests/basic/tier/file_with_spaces.t\
+ tests/basic/tier/tier_lookup_heal.t\
+ tests/basic/glusterd/volfile_server_switch.t\
+ tests/basic/tier/tier-file-create.t\
+ tests/basic/tier/locked_file_migration.t\
+ tests/basic/tier/tier-snapshot.t\
+ tests/basic/volume-snapshot.t\
+ tests/basic/tier/new-tier-cmds.t\
+ tests/basic/quota-nfs.t\
+ tests/geo-rep/georep-basic-dr-rsync.t\
+ tests/basic/tier/unlink-during-migration.t\
+ tests/basic/tier/fops-during-migration-pause.t\
+ tests/basic/volume-snapshot-clone.t\
+ tests/bugs/nfs/bug-1166862.t\
+ tests/basic/tier/legacy-many.t\
+ tests/bugs/nfs/bug-1116503.t\
+ tests/bugs/nfs/bug-904065.t\
+ tests/bugs/rpc/bug-921072.t\
+ tests/bugs/rpc/bug-847624.t\
+ tests/bugs/glusterfs-server/bug-904300.t\
+ tests/bugs/replicate/886998/strict-readdir.t\
+ tests/basic/tier/fops-during-migration.t\
+ tests/basic/tier/tierd_check.t\
+ tests/basic/tier/tier.t\
+ tests/bugs/replicate/bug-1250170-fsync.t\
+ tests/basic/cache.t\
+ tests/geo-rep/georep-basic-dr-tarssh.t\
+ tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t\
+ tests/bitrot/bug-1294786.t\
+ tests/bugs/quick-read/bug-846240.t\
+ tests/bugs/quota/afr-quota-xattr-mdata-heal.t\
+ tests/bugs/quota/bug-1288474.t\
+ tests/bugs/glusterd/bug-1344407-volume-delete-on-node-down.t\
+ tests/bugs/glusterd/859927/repl.t\
+ tests/bugs/glusterd/bug-1238706-daemons-stop-on-peer-cleanup.t\
+ tests/bugs/replicate/bug-1290965-detect-bitrotten-objects.t\
+ tests/bugs/glusterd/bug-1303028-Rebalance-glusterd-rpc-connection-issue.t\
+ tests/bugs/replicate/bug-859581.t\
+ tests/bugs/glusterd/bug-1047955.t\
+ tests/bugs/glusterd/bug-1213295-snapd-svc-uninitialized.t\
+ tests/bugs/glusterd/bug-1260185-donot-allow-detach-commit-unnecessarily.t\
+ tests/bugs/glusterd/bug-1230121-replica_subvol_count_correct_cal.t\
+ tests/bugs/glusterd/bug-1245045-remove-brick-validation.t\
+ tests/bugs/glusterd/bug-948729/bug-948729.t\
+ tests/bugs/glusterd/bug-948729/bug-948729-force.t\
+ tests/bugs/glusterd/bug-948729/bug-948729-mode-script.t\
+ tests/bugs/glusterd/bug-964059.t\
+ tests/bugs/glusterd/bug-888752.t\
+ tests/bugs/glusterd/bug-1177132-quorum-validation.t\
+ tests/bugs/glusterd/bug-889630.t\
+ tests/bugs/glusterd/bug-857330/xml.t\
+ tests/bugs/glusterd/bug-857330/normal.t
+ tests/bugs/glusterd/bug-1367478-volume-start-validation-after-glusterd-restart.t\
+ tests/bugs/glusterd/bug-1223213-peerid-fix.t\
+ tests/bugs/glusterd/bug-1245142-rebalance_test.t\
+ tests/bugs/glusterd/bug-1091935-brick-order-check-from-cli-to-glusterd.t\
+ tests/bugs/glusterd/bug-1323287-real_path-handshake-test.t\
+ tests/bugs/glusterd/bug-1266818-shared-storage-disable.t\
+ tests/bugs/replicate/bug-802417.t\
+ tests/bugs/glusterd/bug-1173414-mgmt-v3-remote-lock-failure.t\
+ tests/bugs/glusterd/bug-1420637-volume-sync-fix.t\
+ tests/bugs/glusterd/bug-1104642.t\
+ tests/bugs/glusterd/bug-1293414-import-brickinfo-uuid.t\
+ tests/bugs/glusterd/bug-1022055.t\
+ tests/bugs/transport/bug-873367.t\
+ tests/bugs/quota/bug-1287996.t\
+ tests/bugs/fb8149516.t\
+ tests/bugs/posix/bug-990028.t\
+ tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t\
+ tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t\
+ tests/bugs/write-behind/bug-1279730.t\
+ tests/bugs/cli/bug-1320388.t\
+ tests/bugs/ec/bug-1304988.t\
+ tests/bugs/glusterd/bug-948686.t\
+ tests/bugs/snapshot/bug-1112613.t\
+ tests/bugs/snapshot/bug-1087203.t\
+ tests/bugs/glusterd/bug-913555.t\
+ tests/bugs/snapshot/bug-1202436-calculate-quota-cksum-during-snap-restore.t\
+ tests/bugs/snapshot/bug-1140162-file-snapshot-features-encrypt-opts-validation.t\
+ tests/bugs/snapshot/bug-1205592.t tests/bugs/glusterd/bug-1231437-rebalance-test-in-cluster.t\
+ tests/bugs/snapshot/bug-1227646.t tests/bugs/shard/zero-flag.t\
+ tests/bugs/snapshot/bug-1399598-uss-with-ssl.t\
+ tests/bugs/snapshot/bug-1049834.t\
+ tests/bugs/core/bug-986429.t\
+ tests/bugs/bitrot/1207029-bitrot-daemon-should-start-on-valid-node.t\
+ tests/bugs/bitrot/1209752-volume-status-should-show-bitrot-scrub-info.t\
+ tests/bugs/fb4482137.t\
+ tests/bugs/tier/bug-1205545-CTR-and-trash-integration.t\
+ tests/features/ipc.t\
+ tests/bugs/tier/bug-1286974.t\
+ tests/bugs/tier/bug-1279376-rename-demoted-file.t\
+ tests/features/ssl-authz.t\
+ tests/bugs/glusterd/bug-857330/normal.t\
+ tests/bugs/distribute/bug-862967.t\
+ tests/basic/quota-anon-fd-nfs.t\
+ tests/basic/rpc-coverage.t\
+ tests/basic/afr/gfid-mismatch.t\
+"
+
+DESIRED_TESTS=$(echo $DESIRED_TESTS | tr -s ' ' ' ')
+KNOWN_FLAKY_TESTS=$(echo $KNOWN_FLAKY_TESTS | tr -s ' ' ' ')
diff --git a/tests/basic/accept-v6v4.t b/tests/basic/accept-v6v4.t
new file mode 100755
index 00000000000..ce3a1bae7f9
--- /dev/null
+++ b/tests/basic/accept-v6v4.t
@@ -0,0 +1,148 @@
+#!/bin/bash
+
+. $(dirname $0)/../nfs.rc
+
+#
+# This test ensures that GlusterFS provides NFS, Mount and its Management daemon
+# over both IPv4 and IPv6. It checks that the services are listening on both
+# IPv4 & IPv6 addresses, and performs a mount to verify that mount & NFS work.
+#
+
+IPV4_SUPPORT=false
+IPV6_SUPPORT=false
+
+host $HOSTNAME | grep -q "has address" && IPV4_SUPPORT=true
+host $HOSTNAME | grep -q "has IPv6 address" && IPV6_SUPPORT=true
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+mkdir -p $B0/b{0,1,2}
+
+# make sure no registered rpcbind services are running
+service rpcbind restart
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI vol create $V0 replica 3 $H0:$B0/b0 $H0:$B0/b1 $H0:$B0/b2
+
+TEST $CLI vol set $V0 cluster.self-heal-daemon off
+TEST $CLI vol set $V0 nfs.disable off
+TEST $CLI vol set $V0 cluster.choose-local off
+TEST $CLI vol start $V0
+
+MOUNTD_PORT=38465
+MGMTD_PORT=24007
+NFSD_PORT=2049
+
+function check_ip_port {
+ ip=$1
+ port=$2
+ type=$3
+
+ # If this address family is not available on the host, skip the check.
+ if [ "$ip" == "NONE" ]; then
+ echo "Y"
+ return
+ fi
+
+ if exec 3<>/dev/tcp/$ip/$port; then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+function check_nfs {
+ ip=$1
+ type=$2
+
+ if [ "$ip" == "NONE" ]; then
+ echo "Y"
+ return
+ fi
+
+ if [ "$type" == "v6" ]; then
+ addr="[$ip]"
+ else
+ addr="$ip"
+ fi
+
+ if mount_nfs $addr:/$V0 $N0; then
+ umount_nfs $N0
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+if ! $IPV4_SUPPORT && ! $IPV6_SUPPORT; then
+ exit 1
+fi
+
+# Get the V4 & V6 addresses of this host
+if $IPV4_SUPPORT; then
+ V4=$(host $HOSTNAME | head -n1 | awk -F ' ' '{print $4}')
+else
+ V4="NONE"
+fi
+
+if $IPV6_SUPPORT; then
+ V6=$(host $HOSTNAME | tail -n1 | awk -F ' ' '{print $5}')
+else
+ V6="NONE"
+fi
+
+# First check the management daemon
+EXPECT "Y" check_ip_port $V6 $MGMTD_PORT "v6"
+EXPECT "Y" check_ip_port $V4 $MGMTD_PORT "v4"
+
+# Give the MOUNT/NFS Daemon some time to start up
+sleep 4
+
+EXPECT "Y" check_ip_port $V4 $MOUNTD_PORT "v6"
+EXPECT "Y" check_ip_port $V6 $MOUNTD_PORT "v4"
+
+EXPECT "Y" check_ip_port $V4 $NFSD_PORT "v6"
+EXPECT "Y" check_ip_port $V6 $NFSD_PORT "v4"
+
+# Mount the file system
+EXPECT "Y" check_nfs $V6 "v6"
+EXPECT "Y" check_nfs $V4 "v4"
+
+# Test a rpcbind crash
+pkill -9 rpcbind && service rpcbind start
+sleep 15
+
+# Test that the port re-registered
+rpcinfo=$(rpcinfo -s | grep nfs | grep -v nfs_acl)
+
+function check_rpcinfo {
+ support=$1
+ type=$2
+
+ if ! $support; then
+ echo "Y"
+ return
+ fi
+
+ if [ "$type" == "v6" ]; then
+ echo $(echo $rpcinfo | grep -q tcp6 && echo "Y" || echo "N")
+ else
+ echo $(echo $rpcinfo | grep -q tcp && echo "Y" || echo "N")
+ fi
+}
+
+EXPECT "Y" check_rpcinfo $IPV4_SUPPORT "v4"
+EXPECT "Y" check_rpcinfo $IPV6_SUPPORT "v6"
+
+cleanup;
diff --git a/tests/basic/afr/gfid-unsplit-shd.t b/tests/basic/afr/gfid-unsplit-shd.t
new file mode 100644
index 00000000000..77da5243724
--- /dev/null
+++ b/tests/basic/afr/gfid-unsplit-shd.t
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 nfs.disable off
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority off
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+#EST $CLI volume set $V0 cluster.favorite-child-by-size off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+# Part I: FUSE Test
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+cd $M0
+mkdir foo
+dd if=/dev/urandom of=foo/splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum foo/splitfile | cut -d\ -f1)
+
+sleep 1
+cd ~
+
+GFID_PARENT_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_PARENT_FORMATTED=$(echo "$GFID_PARENT_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo/splitfile 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+
+# For good measure kill the first brick so the inode cache is wiped; we don't
+# want any funny business.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST $CLI volume start $V0 force
+pkill -f gluster/glustershd
+
+rm -f $GFID_LINK_B1
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/foo/splitfile
+sleep 1
+TEST touch $B0/${V0}1/foo/splitfile
+
+mkdir -p $B0/${V0}1/.glusterfs/fd/55
+ln $B0/${V0}1/foo/splitfile $B0/${V0}1/.glusterfs/fd/55/fd551a5c-fddd-4c1a-a4d0-96ef09ef5c08
+cd ~
+
+touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_FORMATTED
+touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+
+EXPECT_WITHIN 60 "0" get_pending_heal_count $V0
+
+TEST stat $B0/${V0}1/foo/splitfile
+
+cd $M0
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat foo/splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=foo/splitfile of=/dev/null 2>/dev/null
+
+# Verify entry healing happened on the back-end regardless of the
+# gfid-splitbrain state of the directory.
+TEST stat $B0/${V0}1/foo/splitfile
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum foo/splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f foo/splitfile
+cd ~
+
+cleanup
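The awk/substr pipelines in the test above only re-format the raw trusted.gfid xattr value into its UUID form and the matching .glusterfs hard-link path. The same mapping as a stand-alone C sketch, using the gfid hardcoded in the test:

/* Illustrative only: format a gfid hex string (as printed by
 * "getfattr -n trusted.gfid -e hex", without the leading "0x")
 * into its UUID form and relative .glusterfs hard-link path. */
#include <stdio.h>

int
main (void)
{
        const char *hex = "fd551a5cfddd4c1aa4d096ef09ef5c08";
        char uuid[37];

        snprintf (uuid, sizeof (uuid), "%.8s-%.4s-%.4s-%.4s-%.12s",
                  hex, hex + 8, hex + 12, hex + 16, hex + 20);
        /* .glusterfs/<first two hex chars>/<next two>/<uuid> */
        printf (".glusterfs/%.2s/%.2s/%s\n", hex, hex + 2, uuid);
        return 0;
}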
diff --git a/tests/basic/afr/gfid-unsplit-type-mismatch.t b/tests/basic/afr/gfid-unsplit-type-mismatch.t
new file mode 100644
index 00000000000..9e205021a0d
--- /dev/null
+++ b/tests/basic/afr/gfid-unsplit-type-mismatch.t
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 nfs.disable on
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.favorite-child-policy mtime
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+pkill -f gluster/glustershd
+
+# Part I: FUSE Test
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+cd $M0
+dd if=/dev/urandom of=splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum splitfile | cut -d\ -f1)
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+rm -rf $GFID_DIR_B1
+rm -fv $B0/${V0}1/splitfile
+
+# Now really screw the file up, by changing its type to a directory
+# not a file...the so-called "type mismatch" situation. Our test
+# should prove we can un-mangle this situation using the same strategy.
+mkdir $B0/${V0}1/splitfile
+touch -t 199011011510 $B0/${V0}1/splitfile
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile
+cd ~
+
+touch $M0/newfile
+
+# Synthetically force a conservative merge of the directory. We want
+# to ensure that conservative merges happen in spite of GFID mismatches;
+# since we can handle them, there's no sense in not doing these. In fact,
+# if we stop them it will block GFID split-brain resolution.
+setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1
+setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1
+
+# Restart the down brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+cd $M0
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=splitfile of=/dev/null 2>/dev/null
+# Verify entry healing happened on the back-end regardless of the
+# gfid-splitbrain state of the directory.
+TEST stat $B0/${V0}1/splitfile
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f splitfile
+cd ~
+
+cleanup
diff --git a/tests/basic/afr/gfid-unsplit.t b/tests/basic/afr/gfid-unsplit.t
new file mode 100644
index 00000000000..0b883ab658f
--- /dev/null
+++ b/tests/basic/afr/gfid-unsplit.t
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../nfs.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 nfs.disable off
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.favorite-child-policy mtime
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+# Part I: FUSE Test
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+dd if=/dev/urandom of=$M0/splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum $M0/splitfile | cut -d\ -f1)
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+rm -rf $GFID_DIR_B1
+mkdir -p $B0/${V0}1/.glusterfs/fd/55
+ln $B0/${V0}1/splitfile $B0/${V0}1/.glusterfs/fd/55/fd551a5c-fddd-4c1a-a4d0-96ef09ef5c08
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile
+
+GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+#EST rm -f $B0/${V0}3/splitfile
+#m -rf $GFID_DIR_B3
+
+touch $M0/newfile
+
+# Synthetically force a conservative merge of the directory. We want
+# to ensure that conservative merges happen in spite of GFID mismatches;
+# since we can handle them, there's no sense in not doing these. In fact,
+# if we stop them it will block GFID split-brain resolution.
+setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1
+setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1
+
+# Restart the down brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat $M0/splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null
+
+# Verify entry healing happened on the back-end regardless of the
+# gfid-splitbrain state of the directory.
+TEST stat $B0/${V0}1/splitfile
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum $M0/splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f $M0/splitfile
+
+# Part II: NFS test
+TEST mount_nfs $H0:/$V0 $N0 nolock
+#EST mount -t nfs -o nolock,noatime,noacl,soft,intr $H0:/$V0 $N0;
+
+dd if=/dev/urandom of=$N0/splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum $N0/splitfile | cut -d\ -f1)
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+rm -rf $GFID_DIR_B1
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile
+
+GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+#EST rm -f $B0/${V0}3/splitfile
+#m -rf $GFID_DIR_B3
+
+# Restart the down brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat $N0/splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=$N0/splitfile of=/dev/null 2>/dev/null
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum $N0/splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f $N0/splitfile
+
+cleanup
diff --git a/tests/basic/afr/metadata-self-heal.t b/tests/basic/afr/metadata-self-heal.t
index b88c16a93e1..45bae7bdbfc 100644
--- a/tests/basic/afr/metadata-self-heal.t
+++ b/tests/basic/afr/metadata-self-heal.t
@@ -50,6 +50,7 @@ function print_pending_heals {
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume start $V0
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
cd $M0
diff --git a/tests/basic/afr/self-heal.t b/tests/basic/afr/self-heal.t
index e1ac17c2d79..f2af52d9773 100644
--- a/tests/basic/afr/self-heal.t
+++ b/tests/basic/afr/self-heal.t
@@ -194,13 +194,22 @@ TEST rm -rf $M0/*
#7. Link/symlink heal
+# Make links (especially symlinks) with relative paths instead of absolute
+# paths, because absolute paths pointing from the brick to the mountpoint have
+# caused problems.
+make_link () {
+ mountpoint=$1; shift
+ # Do this in a subshell so we don't change "cd -" for the parent.
+ (cd $mountpoint; ln $*)
+}
+
#Test
TEST touch $M0/file
-TEST ln $M0/file $M0/link_to_file
+TEST make_link $M0 file link_to_file
TEST kill_brick $V0 $H0 $B0/brick0
TEST rm -f $M0/link_to_file
-TEST ln -s $M0/file $M0/link_to_file
-TEST ln $M0/file $M0/hard_link_to_file
+TEST make_link $M0 file -s link_to_file
+TEST make_link $M0 file hard_link_to_file
TEST $CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
diff --git a/tests/basic/afr/shd-autofix-nogfid.t b/tests/basic/afr/shd-autofix-nogfid.t
new file mode 100644
index 00000000000..7c9026dce62
--- /dev/null
+++ b/tests/basic/afr/shd-autofix-nogfid.t
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 nfs.disable on
+TEST $CLI volume set $V0 cluster.quorum-type auto
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+# Kill the SHD while we setup the test
+pkill -f gluster/glustershd
+TEST kill_brick $V0 $H0 $B0/${V0}1
+
+mkdir $M0/foo
+dd if=/dev/urandom of=$M0/foo/testfile bs=128k count=5 2>/dev/null
+MD5=$(md5sum $M0/foo/testfile | cut -d\ -f1)
+
+mkdir $B0/${V0}1/foo
+
+# Kick off the SHD and wait 30 seconds for healing to take place
+TEST gluster vol start $V0 force
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+
+# Verify the file was healed back to brick 1
+TEST stat $B0/${V0}1/foo/testfile
+
+# Part II: Test recovery for a file without a GFID
+# Kill the SHD while we setup the test
+pkill -f gluster/glustershd
+TEST kill_brick $V0 $H0 $B0/${V0}1
+# Compute the GFID paths on brick 1 (the file was healed there in Part I)
+GFID_PARENT_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_PARENT_FORMATTED=$(echo "$GFID_PARENT_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo/testfile 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+rm -f $GFID_LINK_B1
+rm -f $B0/${V0}1/foo/testfile
+touch $B0/${V0}1/foo/testfile
+
+# Queue the directories for healing; don't bother to queue the file,
+# as this shouldn't be required.
+touch $B0/${V0}3/.glusterfs/indices/xattrop/00000000-0000-0000-0000-000000000001
+touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED
+
+TEST gluster vol start $V0 force
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+TEST stat $B0/${V0}1/foo/testfile
+
+# Prove the directory and file are removable
+TEST rm -f $B0/${V0}1/foo/testfile
+TEST rmdir $B0/${V0}1/foo
+
+cleanup
diff --git a/tests/basic/afr/shd-force-inspect.t b/tests/basic/afr/shd-force-inspect.t
new file mode 100644
index 00000000000..caceb841322
--- /dev/null
+++ b/tests/basic/afr/shd-force-inspect.t
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 nfs.disable on
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+# Part I: FUSE Test
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+cd $M0
+mkdir foo
+dd if=/dev/urandom of=foo/testfile bs=128k count=5 2>/dev/null
+MD5=$(md5sum foo/testfile | cut -d\ -f1)
+
+# Kill the SHD while we setup the test
+pkill -f gluster/glustershd
+
+# Grab the GFID of the file and parent dir
+GFID_PARENT_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_PARENT_FORMATTED=$(echo "$GFID_PARENT_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo/testfile 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+
+# Nuke the file from brick 1
+rm -f $GFID_LINK_B1
+rm -f $B0/${V0}1/foo/testfile
+
+# Now manually queue up the parent directory for healing
+touch $B0/${V0}2/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED
+touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED
+
+# Kick off the SHD and wait 30 seconds for healing to take place
+TEST gluster vol start $V0 force
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+
+# Verify the file was healed back to brick 1
+TEST stat $B0/${V0}1/foo/testfile
+
+cleanup
diff --git a/tests/basic/afr/shd-pgfid-heal.t b/tests/basic/afr/shd-pgfid-heal.t
new file mode 100644
index 00000000000..6213e4c6374
--- /dev/null
+++ b/tests/basic/afr/shd-pgfid-heal.t
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 nfs.disable on
+TEST $CLI volume set $V0 cluster.quorum-type none
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.pgfid-self-heal on
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+TEST $CLI volume set $V0 storage.build-pgfid on
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+cd $M0
+mkdir -p a/b/c
+dd if=/dev/urandom of=a/b/c/testfile bs=128k count=5 2>/dev/null
+
+# Kill the SHD while we setup the test
+pkill -f gluster/glustershd
+# Kill the brick as well, so that the writes below land only on the
+# other bricks and have to be healed back to it.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+
+echo stuff >> $M0/a/b/c/testfile
+MD5=$(md5sum a/b/c/testfile | cut -d\ -f1)
+
+# Grab the GFID of the file and parent dir
+GFID_PARENT_B_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_PARENT_B_FORMATTED=$(echo "$GFID_PARENT_B_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_PARENT_B_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_PARENT_B_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+GFID_PARENT_C_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b/c 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_PARENT_C_FORMATTED=$(echo "$GFID_PARENT_C_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_PARENT_C_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_PARENT_C_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b/c/testfile 2>/dev/null | grep trusted.gfid | cut -d= -f2)
+GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')
+GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')"
+
+#
+# Here we are going to create a situation such that a file 3
+# levels deep into the FS requires healing, along with 2 levels
+# of parent directories. The only signal SHD has is that the
+# file itself needs healing. The directory (entry) heals are
+# missing, simulating a crash or some sort of bug that we need
+# to be able to recover from.
+#
+
+# Nuke the file from brick 1, along with the parent directories
+# and all backend hard/symbolic links
+rm -f $B0/${V0}1/a/b/c/testfile
+rm -f $GFID_LINK_B1
+rmdir $B0/${V0}1/a/b/c
+rm -f $GFID_PARENT_C_LINK_B1
+rmdir $B0/${V0}1/a/b
+rm -f $GFID_PARENT_B_LINK_B1
+
+# Kick off the SHD and wait 30 seconds for healing to take place
+TEST gluster vol start $V0 force
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+sleep 5
+
+# Verify the file was healed back to brick 1
+TEST stat $B0/${V0}1/a/b/c/testfile
+
+cleanup
diff --git a/tests/basic/bd.t b/tests/basic/bd.t
index 63622edd709..11582db81c0 100755
--- a/tests/basic/bd.t
+++ b/tests/basic/bd.t
@@ -86,6 +86,7 @@ TEST pidof glusterd
configure
TEST $CLI volume create $V0 ${H0}:/$B0/$V0?${V0}
+TEST $CLI volume set $V0 performance.stat-prefetch off
EXPECT "$V0" volinfo_field $V0 'Volume Name';
EXPECT 'Created' volinfo_field $V0 'Status';
diff --git a/tests/basic/cache.t b/tests/basic/cache.t
new file mode 100644
index 00000000000..92251732f4a
--- /dev/null
+++ b/tests/basic/cache.t
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+
+FILE=/var/log/glusterfs/samples/glusterfs_patchy.samp
+rm -f $FILE
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function print_cnt() {
+ local FOP_TYPE=$1
+ local FOP_CNT=$(grep ,${FOP_TYPE} $FILE | wc -l)
+ echo $FOP_CNT
+}
+
+function print_avg() {
+ local FOP_TYPE=$1
+ local FILE=/var/log/glusterfs/samples/glusterfs_patchy.samp
+ local FOP_AVG=$(grep -oE "${FOP_TYPE},[0-9]+\." ${FILE} | grep -oE '[0-9]+' | awk 'NR == 1 { sum = 0 } { sum += $1; } END {printf "%d", sum/NR}')
+ echo $FOP_AVG
+}
+
+cleanup;
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 nfs.disable off
+TEST $CLI volume set $V0 diagnostics.latency-measurement on
+TEST $CLI volume set $V0 diagnostics.count-fop-hits on
+TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535
+TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1
+TEST $CLI volume set $V0 diagnostics.stats-dump-interval 1
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+for i in {1..100}
+do
+ df $M0 &> /dev/null
+done
+
+sleep 6
+
+# Get average
+STATFS_CNT0=$(print_cnt STATFS)
+TEST [ "$STATFS_CNT0" -gt "0" ]
+STATFS_AVG0=$(print_avg STATFS)
+# Make it easier to compute averages
+rm $FILE
+
+TEST $CLI volume set $V0 performance.nfs.io-cache on
+TEST $CLI volume set $V0 performance.statfs-cache on
+TEST $CLI volume set $V0 performance.statfs-cache-timeout 10
+
+for i in {1..100}
+do
+ df $M0 &> /dev/null
+done
+
+sleep 6
+
+# Get average
+STATFS_CNT1=$(print_cnt STATFS)
+TEST [ "$STATFS_CNT1" -eq "$STATFS_CNT0" ]
+STATFS_AVG1=$(print_avg STATFS)
+
+# Verify that the cached average, even multiplied by 10, is still lower
+# (i.e. faster) than the uncached average
+STATFS_AVG1x10=$(($STATFS_AVG1 * 10))
+TEST [ "$STATFS_AVG0" -gt "$STATFS_AVG1x10" ]
+cleanup;
diff --git a/tests/basic/dht-min-free-space.t b/tests/basic/dht-min-free-space.t
new file mode 100755
index 00000000000..9553f9247aa
--- /dev/null
+++ b/tests/basic/dht-min-free-space.t
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../traps.rc
+
+grep $B0/patchy1 /proc/mounts &> /dev/null && umount $B0/patchy1
+grep $B0/patchy2 /proc/mounts &> /dev/null && umount $B0/patchy2
+mkdir $B0/${V0}{1..2}
+
+TEST glusterd
+
+TEST truncate --size $((30*1048576)) $B0/${V0}-dev1
+push_trapfunc "rm -f $B0/${V0}-dev1"
+TEST truncate --size $((30*1048576)) $B0/${V0}-dev2
+push_trapfunc "rm -f $B0/${V0}-dev2"
+
+TEST mkfs.xfs $B0/${V0}-dev1
+TEST mkfs.xfs $B0/${V0}-dev2
+
+TEST mount -o loop $B0/${V0}-dev1 $B0/${V0}1
+TEST mount -o loop $B0/${V0}-dev2 $B0/${V0}2
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}2
+TEST $CLI volume set $V0 cluster.min-free-disk 2MB
+TEST $CLI volume set $V0 cluster.min-free-strict-mode on
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 0
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+####################################
+# Test re-directs of file creation #
+####################################
+
+# This should work, no redirects
+TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=8
+TEST [ -f $B0/${V0}2/testfile1 ] && [ ! -k $B0/${V0}1/testfile1 ]
+
+TEST $CLI volume set $V0 cluster.min-free-disk 19MB
+
+# This should work, with the file redirected:
+# subvolume 2 should have the linkto file &
+# subvolume 1 should have the original
+TEST dd if=/dev/zero of=$M0/testfile3 bs=1M count=4
+TEST [ -f $B0/${V0}1/testfile3 ] && [ ! -k $B0/${V0}1/testfile3 ]
+TEST [ -k $B0/${V0}2/testfile3 ]
+
+# This should fail, cluster is full
+TEST ! dd if=/dev/zero of=$M0/testfile2 bs=1M count=23
+
+###################
+# Strict mode off #
+###################
+TEST $CLI volume set $V0 cluster.min-free-strict-mode off
+TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=20
+TEST rm -f $M0/testfile1
+
+###################
+# Strict mode on #
+###################
+TEST $CLI volume set $V0 cluster.min-free-strict-mode on
+TEST ! dd if=/dev/zero of=$M0/testfile1 bs=1M count=16
+TEST rm -f $M0/testfile1
+
+# Cleanup will deal with our mounts for us, and (because we used "-o loop") our
+# device files too, but not the underlying files. That will happen in the EXIT
+# trap handler instead.
+cleanup;
diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common
index 83c4463a912..152e3b51236 100644
--- a/tests/basic/ec/ec-common
+++ b/tests/basic/ec/ec-common
@@ -45,7 +45,7 @@ for size in $SIZE_LIST; do
eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }')
done
-TEST df -h
+TEST df -h $M0
TEST stat $M0
for idx in `seq 0 $LAST_BRICK`; do
diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t
index 98dd9232c73..3e3467535fb 100644
--- a/tests/basic/ec/self-heal.t
+++ b/tests/basic/ec/self-heal.t
@@ -136,7 +136,7 @@ TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024
cs=$(sha1sum $tmp/test | awk '{ print $1 }')
-TEST df -h
+TEST df -h $M0
TEST stat $M0
for idx in {0..5}; do
diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t
index fdaf9c2822e..da88bbcb2cc 100644
--- a/tests/basic/exports_parsing.t
+++ b/tests/basic/exports_parsing.t
@@ -32,7 +32,20 @@ function test_bad_opt ()
glusterfsd --print-exports $1 2>&1 | sed -n 1p
}
-EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports
+function check_export_line() {
+ if [ "$1" == "$2" ]; then
+ echo "Y"
+ else
+ echo "N"
+ fi
+ return
+}
+
+export_result=$(test_good_file $EXP_FILES/exports)
+EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result"
+
+export_result=$(test_good_file $EXP_FILES/exports-v6)
+EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result"
EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports
EXPECT_KEYWORD "Error parsing netgroups for:" test_long_netgroup $EXP_FILES/bad_exports
diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t
index cea8aa737c0..713c7e27579 100644
--- a/tests/basic/fop-sampling.t
+++ b/tests/basic/fop-sampling.t
@@ -2,13 +2,27 @@
#
. $(dirname $0)/../include.rc
+. $(dirname $0)/../nfs.rc
. $(dirname $0)/../volume.rc
-SAMPLE_FILE="$(gluster --print-logdir)/samples/glusterfs_${V0}.samp"
+BRICK_SAMPLES="$(gluster --print-logdir)/samples/glusterfsd__d_backends_${V0}0.samp"
+NFS_SAMPLES="$(gluster --print-logdir)/samples/glusterfs_nfsd.samp"
+
+function check_path {
+ op=$1
+ path=$2
+ file=$3
+ grep $op $file | awk -F, '{print $11}' | grep $path > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
function print_cnt() {
local FOP_TYPE=$1
- local FOP_CNT=$(grep ,${FOP_TYPE} ${SAMPLE_FILE} | wc -l)
+ local FOP_CNT=$(grep ,${FOP_TYPE} ${BRICK_SAMPLES} | wc -l)
echo $FOP_CNT
}
@@ -42,12 +56,18 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume set $V0 nfs.disable off
TEST $CLI volume set $V0 diagnostics.latency-measurement on
TEST $CLI volume set $V0 diagnostics.count-fop-hits on
-TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2
+TEST $CLI volume set $V0 diagnostics.stats-dump-interval 5
TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535
TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1
TEST $CLI volume set $V0 diagnostics.stats-dnscache-ttl-sec 3600
-
TEST $CLI volume start $V0
+
+>${NFS_SAMPLES}
+>${BRICK_SAMPLES}
+
+#################
+# Basic Samples #
+#################
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
for i in {1..5}
@@ -58,4 +78,52 @@ done
TEST ls -l $M0
EXPECT_WITHIN 6 "OK" check_samples
-cleanup
+sleep 2
+
+################################
+# Paths in the samples #
+################################
+
+TEST mount_nfs $H0:$V0 $N0
+
+ls $N0 &> /dev/null
+touch $N0/file1
+stat $N0/file1 &> /dev/null
+echo "some data" > $N0/file1
+dd if=/dev/zero of=$N0/file2 bs=1M count=10 conv=fsync
+dd if=/dev/zero of=$N0/file1 bs=1M count=1
+cat $N0/file2 &> /dev/null
+mkdir -p $N0/dir1
+rmdir $N0/dir1
+rm $N0/file1
+rm $N0/file2
+
+EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FINODELK /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ENTRYLK / $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $BRICK_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $BRICK_SAMPLES
+
+
+EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ACCESS /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path ACCESS /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path READ /file2 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $NFS_SAMPLES
+EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $NFS_SAMPLES
+
+cleanup;
diff --git a/tests/basic/fops-sanity-gfproxy.t b/tests/basic/fops-sanity-gfproxy.t
new file mode 100755
index 00000000000..b3bb8a502cc
--- /dev/null
+++ b/tests/basic/fops-sanity-gfproxy.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 $H0:$B0/brick1;
+EXPECT 'Created' volinfo_field $V0 'Status';
+
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+
+#gfproxy server
+TEST glusterfs --volfile-id=gfproxy/$V0 --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log
+
+#mount on a random dir
+TEST glusterfs --entry-timeout=3600 --attribute-timeout=3600 -s $H0 --volfile-id=gfproxy-client/$V0 $M0 --direct-io-mode=yes
+TEST grep gfproxy-client /proc/mounts
+
+build_tester $(dirname $0)/fops-sanity.c
+
+TEST cp $(dirname $0)/fops-sanity $M0
+cd $M0
+TEST ./fops-sanity $V0
+cd -
+rm -f $(dirname $0)/fops-sanity
+
+cleanup;
diff --git a/tests/basic/gfid-access.t b/tests/basic/gfid-access.t
index 19b6564e676..fc29a19fc6c 100644
--- a/tests/basic/gfid-access.t
+++ b/tests/basic/gfid-access.t
@@ -8,6 +8,7 @@ cleanup;
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 $H0:$B0/${V0}0
+TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume start $V0
TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 --aux-gfid-mount;
TEST mkdir $M0/a
diff --git a/tests/basic/gfproxy.t b/tests/basic/gfproxy.t
new file mode 100644
index 00000000000..71c6788db76
--- /dev/null
+++ b/tests/basic/gfproxy.t
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../nfs.rc
+
+cleanup;
+
+function start_gfproxyd {
+ glusterfs --volfile-id=gfproxy/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log
+}
+
+function restart_gfproxyd {
+ pkill -f gfproxy/${V0}
+ start_gfproxyd
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 config.gfproxyd-remote-host $H0
+TEST $CLI volume start $V0
+
+sleep 2
+
+REGULAR_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-fuse.vol"
+GFPROXY_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-gfproxy-fuse.vol"
+GFPROXYD_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.gfproxyd.vol"
+
+# Client volfile must exist
+TEST [ -f $GFPROXY_CLIENT_VOLFILE ]
+
+# AHA & write-behind translators must exist
+TEST grep "cluster/aha" $GFPROXY_CLIENT_VOLFILE
+TEST grep "performance/write-behind" $GFPROXY_CLIENT_VOLFILE
+
+# Make sure we didn't screw up the existing client
+TEST grep "performance/write-behind" $REGULAR_CLIENT_VOLFILE
+TEST grep "cluster/replicate" $REGULAR_CLIENT_VOLFILE
+TEST grep "cluster/distribute" $REGULAR_CLIENT_VOLFILE
+
+TEST [ -f $GFPROXYD_VOLFILE ]
+
+TEST grep "cluster/replicate" $GFPROXYD_VOLFILE
+TEST grep "cluster/distribute" $GFPROXYD_VOLFILE
+
+# AHA & write-behind must *not* exist
+TEST ! grep "cluster/aha" $GFPROXYD_VOLFILE
+TEST ! grep "performance/write-behind" $GFPROXYD_VOLFILE
+
+# Test that we can start the server and the client
+TEST start_gfproxyd
+TEST glusterfs --volfile-id=gfproxy-client/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy-client.log $M0
+sleep 2
+TEST grep gfproxy-client/${V0} /proc/mounts
+
+# Write data to the mount and checksum it
+TEST dd if=/dev/urandom bs=1M count=10 of=/tmp/testfile1
+md5=$(md5sum /tmp/testfile1 | awk '{print $1}')
+TEST cp -v /tmp/testfile1 $M0/testfile1
+TEST [ "$(md5sum $M0/testfile1 | awk '{print $1}')" == "$md5" ]
+
+rm /tmp/testfile1
+
+dd if=/dev/zero of=$M0/bigfile bs=1M count=3072 &
+BG_STRESS_PID=$!
+
+sleep 3
+
+restart_gfproxyd
+
+TEST wait $BG_STRESS_PID
+
+cleanup;
diff --git a/tests/basic/glusterd/volfile_server_switch.t b/tests/basic/glusterd/volfile_server_switch.t
index 0b0e6470244..0b01398215c 100644
--- a/tests/basic/glusterd/volfile_server_switch.t
+++ b/tests/basic/glusterd/volfile_server_switch.t
@@ -1,5 +1,8 @@
#!/bin/bash
+#G_TESTDEF_TEST_STATUS_CENTOS6=KNOWN_ISSUE,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=KNOWN_ISSUE,BUG=000000
+
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../cluster.rc
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
new file mode 100644
index 00000000000..f3655eaef3b
--- /dev/null
+++ b/tests/basic/halo-failover-disabled.t
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Tests that Halo fail-over can be disabled
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled and
+# halo-failover-enabled turned off
+# 2. Write some data & fail the active brick
+# 3. The expected result is that the spare brick is NOT swapped in,
+# so quorum is lost and the mount becomes read-only
+# 4. Once the killed brick is force-started again, quorum is restored
+# and the mount is writable once more
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.halo-failover-enabled off
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+
+# Use a large ping time here so the spare brick is not marked up
+# based on the ping time. The only way it can get marked up is
+# by being swapped in via the down event (which is what we are disabling).
+TEST $CLI volume set $V0 network.ping-timeout 1000
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
+
+# Write some data to the mount
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
+
+# Make sure two children are down and one is up.
+EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3
+
+# Test that quorum fails and the mount is RO. The reason here
+# is that although there _is_ another brick running which _could_
+# take the failed brick's place, it is not marked "up" so quorum
+# will not be fulfilled. If we waited 1000 seconds the brick would
+# indeed be activated based on ping time, but for our test we want
+# the decision to be solely "down event" driven, not ping driven.
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX
+
+# Test that quorum should be restored and the file is writable
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+cleanup
diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t
new file mode 100644
index 00000000000..7d23d80968a
--- /dev/null
+++ b/tests/basic/halo-failover-enabled.t
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+#    brick immediately, and md5s will show they are equal once
+# the write completes.
+# 4. The mount should also be RW after the brick is killed as
+# quorum will be immediately restored by swapping in the
+# other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-failover-enabled on
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 network.ping-timeout 20
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
+
+# Write some data to the mount
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST [ -n "$KILL_IDX" ]
+# NB: UP_CHILDREN is the set of children that should be up after we kill
+# the brick indicated by KILL_IDX, *not* the set of children which are
+# currently up!
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
+UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)"
+UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)"
+VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)"
+
+# The victim brick should have a copy of the file.
+TEST [ -n "$VICTIM_HAS_TEST" ]
+
+# Of the bricks which will remain standing, there should be only one
+# brick which has the file called test. If both have the first
+# test file, the test is invalid: all the bricks are up and
+# halo-max-replicas is not being honored, i.e. a bug exists.
+TEST [ "x${UP1_HAS_TEST:+set}" != "x${UP2_HAS_TEST:+set}" ]
+
+echo "Failing child ${KILL_IDX}..."
+TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX}
+
+# Test the mount is still RW (i.e. quorum works)
+TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync
+
+# Calculate the MD5s
+MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1)
+MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1)
+
+# Verify the two up bricks have identical MD5s; if they match, we must
+# have successfully failed over to the brick which was previously shown
+# to be missing the data (via the check above).
+TEST [ "$MD5_UP1" == "$MD5_UP2" ]
+
+cleanup
diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t
new file mode 100644
index 00000000000..4574fdfe41e
--- /dev/null
+++ b/tests/basic/halo-hybrid.t
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test for the Halo hybrid feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+# heal daemon is off to start.
+# 2. Write some data
+# 3. Verify hybrid code chose children for lookups
+# 4. Verify hybrid code chose child for reads
+# 5. Verify hybrid code wrote synchronously to all replicas
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function found_fuse_log_msg {
+ local msg="$1"
+ local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l)
+ if (( $cnt == 1 )); then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-hybrid-mode True
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level TRACE
+TEST $CLI volume start $V0
+
+# Start a synchronous mount
+TEST glusterfs --volfile-id=/$V0 \
+ --xlator-option *replicate*.halo-max-latency=9999 \
+ --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+sleep 2
+cd $M0
+
+TEST mkdir testdir
+TEST cd testdir
+for i in {1..5}
+do
+ dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+TEST ls -l
+
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs"
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child"
+
+B0_CNT=$(ls $B0/${V0}0/testdir | wc -l)
+B1_CNT=$(ls $B0/${V0}1/testdir | wc -l)
+B2_CNT=$(ls $B0/${V0}2/testdir | wc -l)
+
+# Writes should be synchronous, all should have same
+# file count
+TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))"
+
+cleanup
diff --git a/tests/basic/halo.t b/tests/basic/halo.t
new file mode 100644
index 00000000000..25aca3442ab
--- /dev/null
+++ b/tests/basic/halo.t
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Test for the Halo geo-replication feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+# heal daemon is off to start.
+# 2. Write some data
+# 3. Verify at least one of the bricks did not receive the writes.
+# 4. Turn the heal daemon on
+# 5. Within 30 seconds the SHD should async heal the data over
+# to the 3rd brick.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+
+for i in {1..5}
+do
+ dd if=/dev/urandom of=f bs=1M count=1 2>/dev/null
+ mkdir a; cd a;
+done
+
+B0_CNT=$(ls $B0/${V0}0 | wc -l)
+B1_CNT=$(ls $B0/${V0}1 | wc -l)
+B2_CNT=$(ls $B0/${V0}2 | wc -l)
+
+# One of the brick dirs should be empty
+TEST "(($B0_CNT == 0 || $B1_CNT == 0 || $B2_CNT == 0))"
+
+# Ok, turn the heal daemon on and verify it heals it up
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+cleanup
diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t
index 9df5cb45c3b..7f990c9aeb2 100755
--- a/tests/basic/mount-nfs-auth.t
+++ b/tests/basic/mount-nfs-auth.t
@@ -3,6 +3,13 @@
. $(dirname $0)/../include.rc
. $(dirname $0)/../nfs.rc
+# On test systems, connecting to ourselves by hostname appears at the other end
+# as coming from localhost, so that's what needs to go in exports files etc.
+# The only place we really need to use the actual hostname is in the Gluster
+# volume-create thing. Maybe it's an IPv6 thing, maybe it's just a crazy
+# resolver configuration, but this lets the test work.
+H0=localhost
+
# Our mount timeout must be as long as the time for a regular configuration
# change to be acted upon *plus* AUTH_REFRESH_TIMEOUT, not one replacing the
# other. Otherwise this process races vs. the one making the change we're
@@ -15,6 +22,9 @@ TEST glusterd
TEST pidof glusterd
TEST $CLI volume info
+H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1)
+H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}')
+
# Export variables for allow & deny
EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
EXPORT_ALLOW_SLASH="/$V0/ $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
@@ -28,13 +38,21 @@ V0L1="$V0/L1"
V0L2="$V0L1/L2"
V0L3="$V0L2/L3"
+NETGROUP_COMPLEX_ALLOW="storage storage.region\nstorage.region (1.2.3.4,,)\nngtop ng1\nng1 ($H0,,)"
+EXPORT_COMPLEX_RO_ALLOW="/$V0L1 @storage(sec=sys,rw,anonuid=0) @ngtop(sec=sys,ro,anonuid=0)"
+
# Other variations for allow & deny
+EXPORT_ALLOW_NETGROUP_RO="/$V0 @ngtop(sec=sys,ro,anonuid=0)"
EXPORT_ALLOW_RO="/$V0 $H0(sec=sys,ro,anonuid=0) @ngtop(sec=sys,ro,anonuid=0)"
EXPORT_ALLOW_L1="/$V0L1 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
EXPORT_WILDCARD="/$V0 *(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"
function build_dirs () {
- mkdir -p $B0/b{0,1,2}/L1/L2/L3
+ mkdir -p $B0/b{0,1,2,3,4,5}/L1/L2/L3
+}
+
+function export_allow_this_host_ipv6 () {
+ printf "$EXPORT_ALLOW6\n" > ${NFSDIR}/exports
}
function export_allow_this_host () {
@@ -46,6 +64,9 @@ function export_allow_this_host_with_slash () {
}
function export_deny_this_host () {
+ if [[ "$1" && "$1" != "$V0" ]]; then
+ local EXPORT_DENY=$(echo $EXPORT_DENY | sed "s/$V0/$1/")
+ fi
printf "$EXPORT_DENY\n" > ${NFSDIR}/exports
}
@@ -61,6 +82,10 @@ function export_allow_this_host_ro () {
printf "$EXPORT_ALLOW_RO\n" > ${NFSDIR}/exports
}
+function export_allow_netgroup_ro () {
+ printf "$EXPORT_ALLOW_NETGROUP_RO\n" > ${NFSDIR}/exports
+}
+
function netgroup_allow_this_host () {
printf "$NETGROUP_ALLOW\n" > ${NFSDIR}/netgroups
}
@@ -69,8 +94,16 @@ function netgroup_deny_this_host () {
printf "$NETGROUP_DENY\n" > ${NFSDIR}/netgroups
}
+function netgroup_complex_allow() {
+ printf "$NETGROUP_COMPLEX_ALLOW\n" > ${NFSDIR}/netgroups
+}
+
+function export_complex_ro_allow() {
+ printf "$EXPORT_COMPLEX_RO_ALLOW\n" > ${NFSDIR}/exports
+}
+
function create_vol () {
- $CLI vol create $V0 $H0:$B0/b0
+ $CLI vol create $V0 $(hostname):$B0/b0
}
function setup_cluster() {
@@ -104,6 +137,10 @@ function check_mount_failure {
fi
}
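+# Mount volume $1 over NFS at $N0, e.g. "do_mount $V1" mounts $H0:/$V1.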
+function do_mount () {
+ mount_nfs $H0:/$1 $N0 nolock
+}
+
function small_write () {
dd if=/dev/zero of=$N0/test-small-write count=1 bs=1k 2>&1
if [ $? -ne 0 ]; then
@@ -150,10 +187,7 @@ setup_cluster
TEST $CLI vol set $V0 nfs.disable off
TEST $CLI vol start $V0
-# Get NFS state directory
-NFSDIR=$( $CLI volume get patchy nfs.mount-rmtab | \
- awk '/^nfs.mount-rmtab/{print $2}' | \
- xargs dirname )
+NFSDIR=/var/lib/glusterd/nfs
## Wait for volume to register with rpc.mountd
EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available
@@ -186,6 +220,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available
## Mount NFS
EXPECT "Y" check_mount_success $V0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+
+## Mount NFS using the IPv6 export
+export_allow_this_host_ipv6
+EXPECT "Y" check_mount_success $V0
## Disallow host
TEST export_deny_this_host
@@ -260,6 +299,31 @@ TEST ! create # Create should not be allowed
TEST stat_nfs # Stat should be allowed
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+TEST export_allow_netgroup_ro
+TEST netgroup_allow_this_host
+sleep $((AUTH_REFRESH_INTERVAL+1))
+
+EXPECT_WITHIN $MY_MOUNT_TIMEOUT "Y" check_mount_success $V0
+# TBD: figure out why these two tests fail, so they can be reenabled
+#EST ! small_write # Writes should not be allowed
+#EST ! create # Create should not be allowed
+TEST stat_nfs # Stat should be allowed
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+
+# This test checks the case where the exports file
+# has a 'rw' perm set for a netgroup followed
+# by a 'ro' perm for a different netgroup.
+TEST netgroup_complex_allow
+TEST export_complex_ro_allow
+sleep $((AUTH_REFRESH_INTERVAL+1))
+
+EXPECT_WITHIN $MY_MOUNT_TIMEOUT "Y" check_mount_success $V0L1
+# TBD: figure out why these two tests fail, so they can be reenabled
+#EST ! small_write # Writes should not be allowed
+#EST ! create # Create should not be allowed
+TEST stat_nfs # Stat should be allowed
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+
TEST export_deny_this_host
TEST netgroup_deny_this_host
TEST export_allow_this_host_l1 # Allow this host at L1
@@ -320,9 +384,40 @@ TEST $CLI vol set $V0 nfs.auth-refresh-interval-sec 20
## Do a simple test to see if the volume option exists
TEST $CLI vol set $V0 nfs.auth-cache-ttl-sec 400
+## Test authentication in 1 of 2 (sub)volumes
+ME=$(hostname)
+TEST $CLI vol create $V1 replica 3 $ME:$B0/b3 $ME:$B0/b4 $ME:$B0/b5
+TEST $CLI vol set $V1 cluster.self-heal-daemon off
+TEST $CLI vol set $V1 nfs.disable off
+TEST $CLI vol set $V1 cluster.choose-local off
+TEST $CLI vol start $V1
+TEST $CLI volume info $V1;
+
+EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "2" is_nfs_export_available $V0
+EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available $V1
+TEST $CLI vol set $V0 nfs.exports-auth-enable on
+TEST $CLI vol set $V1 nfs.exports-auth-enable off
+# Deny the hosts, but only effective on $V0
+TEST export_deny_this_host $V0
+TEST netgroup_deny_this_host
+TEST export_deny_this_host $V1
+
+sleep $AUTH_REFRESH_INTERVAL
+TEST ! do_mount $V0 # Do a mount & test
+TEST do_mount $V1 # Do a mount & test
+
+TEST touch /tmp/foo
+TEST cp /tmp/foo $N0/
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
+
## Finish up
TEST $CLI volume stop $V0
TEST $CLI volume delete $V0;
TEST ! $CLI volume info $V0;
+TEST $CLI volume stop $V1
+TEST $CLI volume delete $V1;
+TEST ! $CLI volume info $V1;
+
cleanup
diff --git a/tests/basic/pgfid-feat.t b/tests/basic/pgfid-feat.t
index a7baeec7b7a..615a0cd867e 100644
--- a/tests/basic/pgfid-feat.t
+++ b/tests/basic/pgfid-feat.t
@@ -16,6 +16,7 @@ TEST pidof glusterd
TEST $CLI volume info;
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4};
+TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 build-pgfid on;
TEST $CLI volume start $V0;
diff --git a/tests/basic/quota-anon-fd-nfs.t b/tests/basic/quota-anon-fd-nfs.t
index d911cc90b87..a6dec6bfcf8 100755
--- a/tests/basic/quota-anon-fd-nfs.t
+++ b/tests/basic/quota-anon-fd-nfs.t
@@ -17,6 +17,7 @@ TEST pidof glusterd
TEST $CLI volume info;
TEST $CLI volume create $V0 $H0:$B0/brick1;
+TEST $CLI volume set $V0 performance.stat-prefetch off
EXPECT 'Created' volinfo_field $V0 'Status';
TEST $CLI volume set $V0 nfs.disable false
diff --git a/tests/basic/quota.t b/tests/basic/quota.t
index 7f8b21de6f8..99af5a4e7e4 100755
--- a/tests/basic/quota.t
+++ b/tests/basic/quota.t
@@ -19,6 +19,7 @@ TEST pidof glusterd
TEST $CLI volume info;
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4};
+TEST $CLI volume set $V0 performance.stat-prefetch off
EXPECT "$V0" volinfo_field $V0 'Volume Name';
EXPECT 'Created' volinfo_field $V0 'Status';
diff --git a/tests/basic/rpc-coverage.t b/tests/basic/rpc-coverage.t
index a76ba7084eb..b5221dcd9dd 100755..100644
--- a/tests/basic/rpc-coverage.t
+++ b/tests/basic/rpc-coverage.t
@@ -10,6 +10,7 @@ TEST pidof glusterd
TEST $CLI volume info;
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4,5,6,7,8};
+TEST $CLI volume set $V0 performance.stat-prefetch off
EXPECT "$V0" volinfo_field $V0 'Volume Name';
EXPECT 'Created' volinfo_field $V0 'Status';
diff --git a/tests/basic/stats-dump.t b/tests/basic/stats-dump.t
index 7da6e0605a4..2840498218b 100644
--- a/tests/basic/stats-dump.t
+++ b/tests/basic/stats-dump.t
@@ -12,6 +12,7 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume set $V0 diagnostics.latency-measurement on
TEST $CLI volume set $V0 diagnostics.count-fop-hits on
TEST $CLI volume set $V0 diagnostics.stats-dump-interval 1
+TEST $CLI volume set $V0 performance.nfs.io-threads on
TEST $CLI volume set $V0 nfs.disable off
TEST $CLI volume start $V0
EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available
@@ -36,6 +37,10 @@ NFSD_RET="$?"
FUSE_OUTPUT="$(grep 'aggr.fop.write.count": "0"' ${GLUSTERD_WORKDIR}/stats/glusterfs_patchy.dump)"
FUSE_RET="$?"
+# Test that io-stats is getting queue sizes from io-threads
+TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfs_nfsd.dump
+TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump
+
TEST [ 0 -ne "$BRICK_RET" ]
TEST [ 0 -ne "$NFSD_RET" ]
TEST [ 0 -ne "$FUSE_RET" ]
diff --git a/tests/basic/uss.t b/tests/basic/uss.t
index 6cfc0303895..d6ca416bd65 100644
--- a/tests/basic/uss.t
+++ b/tests/basic/uss.t
@@ -382,3 +382,5 @@ TEST ls $M0/.history/snap6/;
TEST ! stat $M0/.history/snap6/aaa;
cleanup;
+
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
diff --git a/tests/basic/write-behind.t b/tests/basic/write-behind.t
new file mode 100644
index 00000000000..edad59786af
--- /dev/null
+++ b/tests/basic/write-behind.t
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function clear_stats {
+ > /var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump
+}
+
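+# Echo "Y" once the brick's stats dump records $2 aggregated writes of
+# size $1, e.g. "got_expected_write_count 1mb 100" after 100 1MB writes.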
+function got_expected_write_count {
+ expected_size=$1
+ expected_value=$2
+ grep aggr.write_${expected_size} "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump" | grep $expected_value
+ if [ $? == 0 ]; then
+ echo "Y";
+ else
+ echo "N";
+ fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+
+# These are needed for our tracking of write sizes
+TEST $CLI volume set $V0 diagnostics.latency-measurement on
+TEST $CLI volume set $V0 diagnostics.count-fop-hits on
+TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2
+
+# Disable this in testing to get deterministic results
+TEST $CLI volume set $V0 performance.write-behind-trickling-writes off
+
+TEST $CLI volume start $V0
+
+sleep 2;
+
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
+
+# Write a 100MB file with a window-size 1MB, we should get 100 writes of 1MB each
+TEST dd if=/dev/zero of=$M0/100mb_file bs=1M count=100
+EXPECT_WITHIN 5 "Y" got_expected_write_count "1mb" 100
+
+TEST $CLI volume set $V0 performance.write-behind-window-size 512KB
+
+# Write a 100MB file with a window-size 512KB, we should get 200 writes of 512KB each
+TEST dd if=/dev/zero of=$M0/100mb_file_2 bs=1M count=100
+EXPECT_WITHIN 5 "Y" got_expected_write_count "512kb" 200
+
+cleanup;
diff --git a/tests/bugs/distribute/bug-1099890.t b/tests/bugs/distribute/bug-1099890.t
index 1a19ba880c0..9f8ae1487cc 100644
--- a/tests/bugs/distribute/bug-1099890.t
+++ b/tests/bugs/distribute/bug-1099890.t
@@ -44,6 +44,8 @@ TEST $CLI volume set $V0 features.quota-deem-statfs on
TEST $CLI volume quota $V0 limit-usage / 150MB;
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1
+
TEST $CLI volume set $V0 cluster.min-free-disk 50%
TEST glusterfs -s $H0 --volfile-id=$V0 $M0
diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t
index c5a7f041ac8..8cf905a8f0b 100755
--- a/tests/bugs/distribute/bug-1161311.t
+++ b/tests/bugs/distribute/bug-1161311.t
@@ -53,8 +53,14 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0;
TEST mkdir $M0/dir1
TEST mkdir -p $M0/dir2/dir3
-# Create a large file (1GB), so that rebalance takes time
-dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240
+# Create a large file (6.4 GB), so that rebalance takes time
+# Reading from /dev/urandom is slow, so we'll cat it together
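+# (64k * 10240 = 640MB per pass; ten passes give ~6.4GB.)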
+dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240
+for i in {1..10}; do
+ cat /tmp/FILE2 >> $M0/dir1/FILE2
+done
+
+#dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240
# Rename the file to create a linkto, for rebalance to
# act on the file
diff --git a/tests/bugs/fb4482137.t b/tests/bugs/fb4482137.t
new file mode 100755
index 00000000000..bd3be89326b
--- /dev/null
+++ b/tests/bugs/fb4482137.t
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+#
+# Test the scenario where an SHD daemon suffers a frame timeout during a
+# crawl. The expected behavior is that the present crawl will continue
+# after the timeout and not deadlock.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+function wait_for_shd_no_sink() {
+ local TIMEOUT=$1
+ # If we see the "no active sinks" log message we know
+ # the heal is alive. It cannot proceed as the "sink"
+ # is hung, but it's at least alive and trying.
+ timeout $TIMEOUT grep -q 'replicate-0: no active sinks for' \
+ <(tail -fn0 /var/log/glusterfs/glustershd.log)
+ return $?
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info 2> /dev/null;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 network.frame-timeout 2
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.heal-timeout 10
+TEST $CLI volume start $V0
+sleep 5
+
+# Mount the volume
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+# Kill brick 1
+TEST kill_brick $V0 $H0 $B0/${V0}1
+sleep 1
+
+# Write some data into the mount which will require healing
+cd $M0
+for i in {1..1000}; do
+ dd if=/dev/urandom of=testdata_$i bs=64k count=1 2>/dev/null
+done
+
+# Re-start the brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+
+sleep 1
+TEST hang_brick $V0 $H0 $B0/${V0}1
+sleep 4
+TEST wait_for_shd_no_sink 20
+cleanup
+
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
diff --git a/tests/bugs/fb8149516.t b/tests/bugs/fb8149516.t
new file mode 100644
index 00000000000..54372794c6f
--- /dev/null
+++ b/tests/bugs/fb8149516.t
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.read-subvolume-index 2
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.heal-timeout 30
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 nfs.disable off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+for i in {1..10}
+do
+ dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+cd ~
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST rm -rf $B0/${V0}2/testfile*
+TEST rm -rf $B0/${V0}2/.glusterfs
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status_in_shd $V0 2
+
+# Verify we see all ten files when ls'ing; without the patch this would
+# return no files and fail.
+FILE_LIST=($(\ls $M0))
+TEST "((${#FILE_LIST[@]} == 10))"
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+
+cleanup
diff --git a/tests/bugs/fuse/bug-858488-min-free-disk.t b/tests/bugs/fuse/bug-858488-min-free-disk.t
index 635dc04d1e6..ab636575d3f 100644
--- a/tests/bugs/fuse/bug-858488-min-free-disk.t
+++ b/tests/bugs/fuse/bug-858488-min-free-disk.t
@@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2
## Lets create volume
TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
+TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1
## Verify volume is created
EXPECT "$V0" volinfo_field $V0 'Volume Name';
diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
index 9fc7ac3b845..3bc80ab9dab 100644
--- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
+++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
@@ -1,6 +1,6 @@
#!/bin/bash
-## Test case for cluster.min-free-disk option validation.
+## Test case for cluster.min-free-disk option validation.
. $(dirname $0)/../../include.rc
@@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2
TEST $CLI volume start $V0
## Setting invalid value for option cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk 143.!/12
-TEST ! $CLI volume set $V0 min-free-disk 123%
-TEST ! $CLI volume set $V0 min-free-disk 194.34%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12
+TEST ! $CLI volume set $V0 cluster.min-free-disk 123%
+TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34%
## Setting fractional value as a size (unit is byte) for option
## cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk 199.051
-TEST ! $CLI volume set $V0 min-free-disk 111.999
+TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051
+TEST ! $CLI volume set $V0 cluster.min-free-disk 111.999
## Setting valid value for option cluster.min-free-disk should pass
-TEST $CLI volume set $V0 min-free-disk 12%
-TEST $CLI volume set $V0 min-free-disk 56.7%
-TEST $CLI volume set $V0 min-free-disk 120
-TEST $CLI volume set $V0 min-free-disk 369.0000
+TEST $CLI volume set $V0 cluster.min-free-disk 12%
+TEST $CLI volume set $V0 cluster.min-free-disk 56.7%
+TEST $CLI volume set $V0 cluster.min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 369.0000
cleanup;
diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t
index c30d2b852d4..1b9ca18c08a 100755
--- a/tests/bugs/glusterd/bug-859927.t
+++ b/tests/bugs/glusterd/bug-859927.t
@@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " "
TEST $CLI volume set $V0 min-free-inodes 60%
EXPECT "60%" volume_option $V0 cluster.min-free-inodes
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk " "
-TEST $CLI volume set $V0 min-free-disk 60%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk " "
+TEST $CLI volume set $V0 cluster.min-free-disk 60%
EXPECT "60%" volume_option $V0 cluster.min-free-disk
-TEST $CLI volume set $V0 min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 120
EXPECT "120" volume_option $V0 cluster.min-free-disk
TEST ! $CLI volume set $V0 frame-timeout ""
diff --git a/tests/bugs/nfs/bug-1166862.t b/tests/bugs/nfs/bug-1166862.t
index f986fe36ab7..fd57ccb992b 100755
--- a/tests/bugs/nfs/bug-1166862.t
+++ b/tests/bugs/nfs/bug-1166862.t
@@ -65,3 +65,7 @@ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
EXPECT '0' count_lines cat $GLUSTERD_WORKDIR/nfs/rmtab
cleanup
+
+# rmtab support permanently hacked out on FB branch.
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
diff --git a/tests/bugs/nfs/bug-904065.t b/tests/bugs/nfs/bug-904065.t
index 0becb756da4..0d539a2341c 100755
--- a/tests/bugs/nfs/bug-904065.t
+++ b/tests/bugs/nfs/bug-904065.t
@@ -90,3 +90,7 @@ EXPECT '2' count_lines $M0/rmtab
# rmtab.
cleanup
+
+# rmtab support permanently hacked out on FB branch.
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
diff --git a/tests/bugs/quota/bug-1292020.t b/tests/bugs/quota/bug-1292020.t
index 14b311c9d76..f713c74859b 100644
--- a/tests/bugs/quota/bug-1292020.t
+++ b/tests/bugs/quota/bug-1292020.t
@@ -4,10 +4,12 @@
. $(dirname $0)/../../volume.rc
function write_sample_data () {
- dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 | grep -i exceeded
+ dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 |
+ egrep -i 'exceeded|no space' && echo 'passed'
}
cleanup;
+rm -f /tmp/kbv.log
TEST glusterd;
TEST pidof glusterd;
@@ -18,7 +20,8 @@ TEST $CLI volume quota $V0 enable;
TEST $CLI volume quota $V0 limit-usage / 1
TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0;
-EXPECT "exceeded" write_sample_data
+
+EXPECT "passed" write_sample_data
TEST $CLI volume stop $V0
TEST $CLI volume delete $V0
diff --git a/tests/bugs/replicate/bug-859581.t b/tests/bugs/replicate/bug-859581.t
index d8b45a257a1..313067b6049 100755
--- a/tests/bugs/replicate/bug-859581.t
+++ b/tests/bugs/replicate/bug-859581.t
@@ -51,3 +51,5 @@ TEST $CLI volume delete $V0
cleanup
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
diff --git a/tests/cluster.rc b/tests/cluster.rc
index 467bbcb06e1..42547f09e37 100644
--- a/tests/cluster.rc
+++ b/tests/cluster.rc
@@ -46,17 +46,18 @@ function define_glusterds() {
bopt="management.transport.socket.bind-address=${!h}";
popt="--pid-file=${!b}/glusterd.pid";
sopt="management.glusterd-sockfile=${!b}/glusterd/gd.sock"
+ aopt="*.transport.address-family=inet"
#Get the logdir
logdir=`gluster --print-logdir`
#Fetch the testcases name and prefix the glusterd log with it
logfile=`echo ${0##*/}`_glusterd$i.log
lopt="--log-file=$logdir/$logfile"
if [ "$2" == "-LDEBUG" ]; then
- eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
- eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
+ eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
+ eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
else
- eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
- eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'";
+ eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
+ eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";
fi
done
}
diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6
new file mode 100644
index 00000000000..426b1ef5705
--- /dev/null
+++ b/tests/configfiles/exports-v6
@@ -0,0 +1 @@
+/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,)
diff --git a/tests/env.rc.in b/tests/env.rc.in
index 82971c4a8de..87befc3711d 100644
--- a/tests/env.rc.in
+++ b/tests/env.rc.in
@@ -28,3 +28,6 @@ export PYTHON
PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH
export PYTHONPATH
+
+TESTER_CFLAGS="@TESTER_CFLAGS@"
+export TESTER_CFLAGS
diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t
new file mode 100755
index 00000000000..0fc5a241534
--- /dev/null
+++ b/tests/features/brick-min-free-space.t
@@ -0,0 +1,121 @@
+#!/bin/bash
+#
+# Test storage.min-free-disk option works.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+TEST truncate -s 16M $B0/brick0
+TEST LOOPDEV=$(losetup --find --show $B0/brick0)
+TEST mkfs.xfs $LOOPDEV
+
+mkdir -p $B0/$V0
+
+TEST mount -t xfs $LOOPDEV $B0/$V0
+
+###########
+# AIO on #
+###########
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio on
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
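+# (8388608 bytes = 8MB; with ~12MB of capacity and ~10MB already written,
+# free space is now below the limit, so even small writes should fail.)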
+
+sleep 5
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+sleep 5
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit.
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+############
+# AIO off #
+############
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
+
+sleep 5
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+sleep 5
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit.
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup;
diff --git a/tests/features/lock_revocation.t b/tests/features/lock_revocation.t
new file mode 100644
index 00000000000..cbf21b71650
--- /dev/null
+++ b/tests/features/lock_revocation.t
@@ -0,0 +1,52 @@
+#!/bin/bash
+logdir=$(gluster --print-logdir)
+BRICK_LOGFILES="$logdir/bricks/d-backends-brick?.log"
+rm -f $BRICK_LOGFILES &> /dev/null
+
+# Test that lock revocation works
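+#
+# With features.locks-monkey-unlocking enabled the brick randomly drops
+# unlock requests (logging "MONKEY LOCKING"), leaving a lock stuck on the
+# test file; features.locks-revocation-secs=2 should then revoke the stuck
+# lock so a second client can still append to the same file.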
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+cleanup;
+
+function deadlock_fop() {
+ local MNT=$1
+ for i in {1..1000}; do
+ dd if=/dev/zero of=$MNT/testfile bs=1k count=10 &> /dev/null
+ if grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null; then
+ break
+ fi
+ done
+}
+
+function monkey_unlock() {
+ grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null && echo SUCCESS
+ return 0
+}
+
+function append_to_file() {
+ local FILE_PATH=$1
+ echo "hello" >> $FILE_PATH
+ return 0
+}
+
+#Init
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 features.locks-monkey-unlocking on
+TEST $CLI volume set $V0 features.locks-revocation-secs 2
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 -s $H0 $M0;
+TEST $GFS --volfile-id=$V0 -s $H0 $M1;
+
+# Deadlock writes to a file using monkey unlocking
+deadlock_fop $M0 &
+EXPECT_WITHIN 60 "SUCCESS" monkey_unlock
+
+# Sleep > unlock timeout and attempt to write to the file
+sleep 3
+TEST append_to_file $M1/testfile
+
+cleanup
diff --git a/tests/halo.rc b/tests/halo.rc
new file mode 100644
index 00000000000..4cb7c81da85
--- /dev/null
+++ b/tests/halo.rc
@@ -0,0 +1,52 @@
+# Return the current Halo state of a given child (by index, i.e. 0
+# is first child).
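+# A matching log line (written by dump_halo_states in afr-common.c) looks
+# like "... Child 0 halo state: UP (12 ms)", for which this echoes "UP".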
+function halo_child_state {
+ grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG |
+ tail -n1 | sed 's/^.* halo state: //' | sed 's/ .*$//'
+}
+
+# Return number of Halo children which are in a given state.
+# First parameter is total # children.
+# Second parameter is state to match (e.g. "UP").
+function halo_children_in_state {
+ local CHILD_COUNT=$1
+ local SUM=0
+ for CHILD in $(seq 0 $((CHILD_COUNT-1))); do
+ if [ x"$(halo_child_state $CHILD)" == x"$2" ]; then
+ SUM=$((SUM+1))
+ fi
+ done
+ echo $SUM
+}
+
+# Return number of up halo children,
+# First parameter is total # children,
+function halo_children_up {
+ echo $(halo_children_in_state $1 "UP")
+}
+
+# Return number of down halo children,
+# First parameter is total # children,
+function halo_children_down {
+ echo $(halo_children_in_state $1 "DOWN")
+}
+
+# Return number of up & down halo children.
+# First parameter is total number of children.
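+# Example: with children 0 and 2 UP and child 1 DOWN, "halo_sum_child_states 3"
+# echoes "2 1", the form the halo failover tests expect.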
+function halo_sum_child_states {
+ local CHILD_COUNT=$1
+
+ local UP=0
+ local DOWN=0
+
+ for CHILD in $(seq 0 $((CHILD_COUNT-1))); do
+ local STATE=$(halo_child_state $CHILD)
+ if [ x"$STATE" == x"UP" ]; then
+ UP=$((UP+1))
+ elif [ x"$STATE" == x"DOWN" ]; then
+ DOWN=$((DOWN+1))
+ fi
+ done
+
+ echo "$UP $DOWN"
+}
diff --git a/tests/include.rc b/tests/include.rc
index 492e35a7b6c..8b6504e6c58 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -19,11 +19,13 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g
CC=cc
OSTYPE=$(uname -s)
-ENV_RC=$(dirname $0)/../env.rc
+M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point
+
+ENV_RC=$(dirname $0)/env.rc
if [ ! -f $ENV_RC ]; then
- ENV_RC=$(dirname $0)/../../env.rc
+ ENV_RC=$(dirname $0)/../env.rc
if [ ! -f $ENV_RC ]; then
- ENV_RC=$(dirname $0)/../../../env.rc
+ ENV_RC=$(dirname $0)/../../env.rc
fi
fi
@@ -171,6 +173,7 @@ function test_footer()
echo "FAILED COMMAND: $saved_cmd"
fi
if [ "$EXIT_EARLY" = "1" ]; then
+ cleanup
exit $RET
fi
fi
@@ -350,6 +353,7 @@ which killall > /dev/null || {
which pidof > /dev/null || {
pidof() {
+
$PYTHON pidof.py $@
}
}
@@ -422,11 +426,13 @@ stat -c %s /dev/null > /dev/null 2>&1 || {
function cleanup()
{
+ local OLDPWD=$PWD
+ cd # Things go pear-shaped if we're inside a Gluster mount.
# Prepare flags for umount
case `uname -s` in
Linux)
- flag="-l"
+ flag="-l -f --no-canonicalize"
;;
NetBSD)
flag="-f -R"
@@ -573,6 +579,8 @@ function cleanup()
# above to fail, promoting that into a failure of the whole test (and
# thus of an entire regression-test run) seems a bit excessive. Make
# sure we return good status anyway.
+
+ cd $OLDPWD
return 0
}
@@ -612,6 +620,7 @@ function build_tester ()
then
cflags="$cflags $(pkg-config glusterfs-api --cflags-only-I --libs-only-L)"
fi
+ cflags="$cflags ${TESTER_CFLAGS}"
$CC -g -o $(dirname $cfile)/$execname $cfile $cflags
}
@@ -1163,3 +1172,5 @@ function STAT_INO()
echo 0
fi
}
+
+systemctl stop nfs-mountd
diff --git a/tests/nfs.rc b/tests/nfs.rc
index 2140f311c33..ee52d96e6d3 100644
--- a/tests/nfs.rc
+++ b/tests/nfs.rc
@@ -23,7 +23,7 @@ function mount_nfs ()
local m=$2
local opt=$3
if [ ! -z "$opt" ]; then opt=",$opt"; fi
- opt="soft,intr,vers=3$opt"
+ opt="soft,intr,nfsvers=3,proto=tcp$opt"
nopt=""
for o in ${opt//,/ }; do
diff --git a/tests/volume.rc b/tests/volume.rc
index f95c0013b2e..84630f3d4b4 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -237,6 +237,13 @@ function kill_brick {
kill -9 $(get_brick_pid $vol $host $brick)
}
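+# Suspend a brick process with SIGSTOP to simulate a hung (but not dead)
+# brick; used by tests/bugs/fb4482137.t.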
+function hang_brick {
+ local vol=$1
+ local host=$2
+ local brick=$3
+ kill -STOP $(get_brick_pid $vol $host $brick)
+}
+
function check_option_help_presence {
local option=$1
$CLI volume set help | grep "^Option:" | grep -w $option
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 903fbb39f12..bce94bb8b3b 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht ec
+SUBDIRS = aha stripe afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7532b014ff7..4c2343f8e9b 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -43,6 +43,13 @@
#include "afr-self-heald.h"
#include "afr-messages.h"
+#define CHILD_UP_STR "UP"
+#define CHILD_DOWN_STR "DOWN"
+#define CHILD_DISCONNECTED_STR "DOWN"
+
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *fastest_children);
+
call_frame_t *
afr_copy_frame (call_frame_t *base)
{
@@ -1078,7 +1085,8 @@ refresh_done:
int
afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)
{
- call_frame_t *heal_frame = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal_frame = NULL;
afr_local_t *local = NULL;
gf_boolean_t start_heal = _gf_false;
afr_local_t *heal_local = NULL;
@@ -1092,13 +1100,15 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)
}
local = frame->local;
+ priv = this->private;
ret = afr_replies_interpret (frame, this, local->refreshinode,
&start_heal);
err = afr_inode_refresh_err (frame, this);
- if (ret && afr_selfheal_enabled (this) && start_heal) {
+ if (priv->did_discovery == _gf_false ||
+ (afr_selfheal_enabled (this) && start_heal)) {
heal_frame = copy_frame (frame);
if (!heal_frame)
goto refresh_done;
@@ -1380,6 +1390,12 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
"Unable to set list-xattr in dict ");
}
+ ret = dict_set_int32 (xattr_req, GET_ANCESTRY_PATH_KEY, 42);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set ancestry path key in dict ");
+ }
+
return ret;
}
@@ -1466,21 +1482,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
sizeof(gfid_copy)) % child_count;
}
+/*
+ * afr_halo_read_subvol
+ *
+ * Given an array representing the readable children, this function will
+ * return which one of the readable children meets the halo hybrid criteria.
+ * In the event none are found, -1 is returned and another strategy will have
+ * to be used to figure out where the read should come from.
+ */
+int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) {
+ afr_private_t *priv = NULL;
+ unsigned char *hybrid_children;
+ int32_t hybrid_cnt = 0;
+ int read_subvol = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ /* Halo inactive or hybrid mode disabled, bail out. */
+ if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+ return -1;
+
+ /* AFR discovery edge case: if you are already pinned to a child
+ * which meets the latency threshold then go with this child for
+ * consistency purposes.
+ */
+ if (priv->read_child >= 0 && readable[priv->read_child] &&
+ priv->child_latency[priv->read_child] <=
+ AFR_HALO_HYBRID_LATENCY_MSEC) {
+ return priv->read_child;
+ }
+
+ hybrid_children = alloca0 (priv->child_count);
+ hybrid_cnt = find_hybrid_children (this, hybrid_children);
+ if (hybrid_cnt) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i] && hybrid_children[i]) {
+ read_subvol = i;
+ priv->read_child = read_subvol;
+ gf_log (this->name, GF_LOG_TRACE,
+ "Selected hybrid child %d for reads",
+ i);
+ break;
+ }
+ }
+ }
+
+ return read_subvol;
+}
+
int
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
unsigned char *readable,
afr_read_subvol_args_t *args)
{
- int i = 0;
- int read_subvol = -1;
- afr_private_t *priv = NULL;
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
afr_read_subvol_args_t local_args = {0,};
- priv = this->private;
+ priv = this->private;
- /* first preference - explicitly specified or local subvolume */
- if (priv->read_child >= 0 && readable[priv->read_child])
+ /* Choose lowest latency child for reads */
+ read_subvol = afr_halo_read_subvol (this, readable);
+ if (read_subvol != -1)
+ return read_subvol;
+
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
return priv->read_child;
if (inode_is_linked (inode)) {
@@ -1506,7 +1576,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
return -1;
}
-
int
afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
unsigned char *readable, int *event_p,
@@ -1697,6 +1766,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->readable);
GF_FREE (local->readable2);
+ GF_FREE (local->heal_ancestry_path);
+
if (local->inode)
inode_unref (local->inode);
@@ -2166,6 +2237,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv->children[child_index]->name);
priv->read_child = child_index;
+ } else if (priv->halo_enabled) {
+ if (priv->read_child < 0) {
+ priv->read_child = child_index;
+ } else if (priv->child_latency[child_index] <
+ priv->child_latency[priv->read_child]) {
+ priv->read_child = child_index;
+ }
}
out:
STACK_DESTROY(frame->root);
@@ -2357,7 +2435,6 @@ unwind:
return 0;
}
-
int
afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
{
@@ -2523,6 +2600,8 @@ unwind:
local->op_errno = ENOTCONN;
}
+ priv->did_discovery = _gf_true;
+
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->replies[read_subvol].poststat,
local->replies[read_subvol].xdata,
@@ -2555,7 +2634,7 @@ afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->replies[child_index].xdata = dict_ref (xdata);
}
- if (local->do_discovery && (op_ret == 0))
+ if (local->do_local_discovery && (op_ret == 0))
afr_attempt_local_discovery (this, child_index);
if (xdata) {
@@ -2583,6 +2662,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int call_count = 0;
+ unsigned char *hybrid_children = NULL;
local = frame->local;
priv = this->private;
@@ -2593,8 +2673,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
goto out;
}
- call_count = local->call_count = AFR_COUNT (local->child_up,
- priv->child_count);
+ hybrid_children = alloca0 (priv->child_count);
+ call_count = find_hybrid_children (this, hybrid_children);
+ if (call_count) {
+ for (i = 0; i < priv->child_count; i++)
+ local->child_up[i] = hybrid_children[i];
+ gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid "
+ "children for LOOKUPs", call_count);
+ } else {
+ hybrid_children = NULL;
+ call_count = AFR_COUNT (local->child_up, priv->child_count);
+ }
+
+ local->call_count = call_count;
ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
&local->loc);
@@ -2648,12 +2739,12 @@ afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req
if (!priv->root_inode)
priv->root_inode = inode_ref (loc->inode);
- if (priv->choose_local && !priv->did_discovery) {
+ if (priv->choose_local && !priv->did_local_discovery) {
/* Logic to detect which subvolumes of AFR are
local, in order to prefer them for reads
*/
- local->do_discovery = _gf_true;
- priv->did_discovery = _gf_true;
+ local->do_local_discovery = _gf_true;
+ priv->did_local_discovery = _gf_true;
}
}
@@ -2827,6 +2918,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
AFR_DATA_TRANSACTION, NULL);
+ /* So this is the "secret" to why "Hybrid" halo works. Encoded in
+ * the cached inodes, we store what is effectively the "generational"
+ * state of the cluster along with a "packed" version of the extended
+ * attributes which determine which nodes are wise/fools. We can
+ * consult these cached values to figure out who we can trust, in the
+ * event the state of our cluster changes and we can no longer trust
+ * the cached info we "refresh" the inode (and hit all regions) to
+ * ensure we know which bricks we can safely read from.
+ */
if (event != local->event_generation)
afr_inode_refresh (frame, this, loc->parent, NULL,
afr_lookup_do);
@@ -3051,7 +3151,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_UNWIND (flush, frame, local->op_ret,
@@ -4317,25 +4417,569 @@ __get_heard_from_all_status (xlator_t *this)
return heard_from_all;
}
+/*
+ * afr_cmp_child
+ *
+ * Passed to the qsort function to order a list of children by the latency
+ * and/or up/down states.
+ *
+ * Note: This isn't as simple as taking the latencies and calling it
+ * a day. Children can be marked down, which overrides their latency
+ * signal. Having a lower-latency child available doesn't guarantee this
+ * child shall be marked up: we don't want to constantly be swapping
+ * slightly better bricks for others... this is jarring to clients and
+ * could cause all sorts of issues. Plus, the fail-over and max-replicas
+ * flags, which manage the up/down state of children, must all be honored.
+ *
+ * In short, the (as marked) up/down state of the brick shall always
+ * take precedence when sorting by latency.
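+ *
+ * Illustrative ordering: an up child at 5 ms sorts ahead of an up child
+ * at 50 ms, and both sort ahead of any child marked down, regardless of
+ * the down child's measured latency.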
+ */
+static int
+_afr_cmp_child (const void *child1, const void *child2)
+{
+ struct afr_child *child11 = (struct afr_child *)child1;
+ struct afr_child *child22 = (struct afr_child *)child2;
+
+ /* If both children are _marked_ down they are equal */
+ if (!child11->child_up && !child22->child_up)
+ return 0;
+
+ /* Prefer child 2, child 1 is _marked_ down, child 2 is not */
+ if (!child11->child_up && child22->child_up)
+ return 1;
+
+ /* Prefer child 1, child 2 is _marked_ down, child 1 is not */
+ if (child11->child_up && !child22->child_up)
+ return -1;
+
+ if (child11->latency > child22->latency) {
+ return 1;
+ }
+ if (child11->latency == child22->latency) {
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * find_hybrid_children
+ *
+ * Given a char array representing our children (aka bricks within our
+ * AFR "subvolume"), we'll mark this array with the children which are
+ * within halo_hybrid_read_max_latency_msec or, if none fit this
+ * condition, we'll pick the fastest two bricks.
+ *
+ * You might ask, why not just pick the quickest brick and be done with it?
+ * Well, being within our set is not sufficient to be chosen for the read;
+ * we must also be marked "readable", so we still want to choose as many as
+ * we can within our local region to ensure somebody in it is readable.
+ *
+ * To illustrate this, consider the case where only 1 of 2 bricks received
+ * a sync from some other writer, and the 2nd brick, although faster,
+ * wasn't present. In this case we'll want to use the slower brick to
+ * service the read.
+ *
+ * In short, this function just tells the caller which children are the
+ * hybrid candidates; it gives no signal as to their readability, nor
+ * should it, since this is
+ * handled later in the various flows (e.g. by afr_halo_read_subvol).
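+ *
+ * Illustrative example: with children at 3 ms, 7 ms and 90 ms and a max
+ * latency of 10 ms, the first two are marked as hybrid candidates; if all
+ * of them exceeded the threshold, only the fastest ones (up to the child
+ * limit) would be marked instead.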
+ */
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *hybrid_children)
+{
+ int32_t i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_child *sorted_list = NULL;
+ uint32_t max_latency;
+ uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT;
+
+ priv = this->private;
+
+ if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+ return 0;
+
+ if (limit > priv->child_count)
+ limit = priv->child_count;
+
+ max_latency = priv->halo_hybrid_read_max_latency_msec;
+
+ sorted_list = alloca (sizeof (struct afr_child) * priv->child_count);
+
+ /* Gather each child's up state and latency for sorting */
+ for (i = 0; i < priv->child_count; i++) {
+ sorted_list[i].idx = i;
+ sorted_list[i].child_up = priv->child_up[i];
+ sorted_list[i].latency = priv->child_latency[i];
+ }
+
+ /* QuickSort the children according to latency */
+ qsort (sorted_list, priv->child_count, sizeof (struct afr_child),
+ _afr_cmp_child);
+
+ i = 0;
+ while (i < priv->child_count && sorted_list[i].latency <= max_latency)
+ hybrid_children[sorted_list[i++].idx] = 1;
+
+ /* Found some candidates */
+ if (i != 0)
+ return i;
+
+ /* If no candidates can be found meeting the max_latency threshold
+ * then find the best of those we have to our limit.
+ */
+ for (i = 0; i < limit; i++)
+ hybrid_children[sorted_list[i].idx] = 1;
+
+ return i;
+}
+
+int
+find_best_down_child (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t best_child = -1;
+ int64_t best_latency = INT64_MAX;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i] &&
+ priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] < best_latency) {
+ best_child = i;
+ best_latency = priv->child_latency[i];
+ }
+ }
+ if (best_child >= 0) {
+ gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) "
+ "@ %ld ms latency", best_child, best_latency);
+ }
+ return best_child;
+}
+
+int
+find_worst_up_child (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t worst_child = -1;
+ int64_t worst_latency = INT64_MIN;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] &&
+ priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] >= worst_latency) {
+ worst_child = i;
+ worst_latency = priv->child_latency[i];
+ }
+ }
+ if (worst_child >= 0) {
+ gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)"
+ " @ %ld ms latency", worst_child, worst_latency);
+ }
+ return worst_child;
+}
+
+static const char *halo_state_str(int i)
+{
+ switch (i) {
+ case 0: return "DOWN";
+ case 1: return "UP";
+ }
+
+ return "unknown";
+}
+
+
+static void dump_halo_states (xlator_t *this) {
+ afr_private_t *priv = NULL;
+ int i = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Child %d halo state: %s (N/A)",
+ i,
+ halo_state_str(priv->child_up[i]));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Child %d halo state: %s (%"PRIi64" ms)",
+ i,
+ halo_state_str(priv->child_up[i]),
+ priv->child_latency[i]);
+ }
+ }
+}
+
+static void
+_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
+ const int idx, const int64_t halo_max_latency_msec,
+ int32_t *event, int64_t *child_latency_msec,
+ gf_boolean_t child_halo_enabled)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int up_children = 0;
+ int best_down_child = 0;
+ uint64_t latency_samples = 0;
+
+ priv = this->private;
+
+ /* Base it off the _minimum_ latency we've ever seen */
+ *child_latency_msec = child_xlator->client_latency.min / 1000.0;
+ latency_samples = child_xlator->client_latency.count;
+ priv->child_latency[idx] = *child_latency_msec;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] == 1) {
+ up_children++;
+ }
+ }
+
+ /* Don't do anything until we have some minimum number of
+ * latency samples */
+ if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) {
+ gf_log (child_xlator->name, GF_LOG_INFO, "Insufficient "
+ "number of latency samples (%" PRIu64
+ " < %d), halo inactive.",
+ latency_samples, priv->halo_min_samples);
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "ping: child %u (%s) latency %"PRIu64" ms (max %"PRIu64" ms)"
+ " up_count %d (min %d) enabled %s",
+ idx, child_xlator ? child_xlator->name : "<null>",
+ *child_latency_msec,
+ halo_max_latency_msec,
+ up_children,
+ priv->halo_min_replicas,
+ child_halo_enabled ? "true" : "false");
+
+ /*
+ * Case 1: This child's latency exceeds the maximum allowable
+ * for this halo.
+ */
+ if (child_halo_enabled &&
+ *child_latency_msec > halo_max_latency_msec &&
+ priv->child_up[idx] == 1 &&
+ up_children > priv->halo_min_replicas) {
+ if (find_worst_up_child (this) == idx) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%"PRIi64"ms) "
+ "exceeds halo threshold (%"PRIi64"), "
+ "marking child down, "
+ "min_replicas (%d) still "
+ "satisfied.",
+ *child_latency_msec,
+ halo_max_latency_msec,
+ priv->halo_min_replicas);
+ *event = GF_EVENT_CHILD_DOWN;
+ }
+ /*
+ * Case 2: Child latency is within halo and currently marked down,
+ * mark it up.
+ */
+ } else if ((child_halo_enabled == _gf_false ||
+ *child_latency_msec <= halo_max_latency_msec) &&
+ priv->child_up[idx] == 0) {
+ if (child_halo_enabled == _gf_false ||
+ up_children < priv->halo_max_replicas) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%ld ms) "
+ "below halo threshold (%ld) or halo is "
+ "disabled, marking child up.",
+ *child_latency_msec,
+ halo_max_latency_msec);
+ *event = GF_EVENT_CHILD_UP;
+ } else {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Not marking child %d up, "
+ "max replicas (%d) reached.", idx,
+ priv->halo_max_replicas);
+ }
+ /*
+ * Case 3: Child latency is within halo and currently marked up;
+ * mark it down if it is the highest-latency child and the
+ * number of up children is greater than halo_max_replicas,
+ * unless we are an SHD, in which case do nothing.
+ */
+ } else if ((child_halo_enabled == _gf_true &&
+ *child_latency_msec <= halo_max_latency_msec) &&
+ priv->child_up[idx] == 1) {
+ if (find_worst_up_child (this) == idx &&
+ up_children > priv->halo_max_replicas &&
+ !priv->shd.iamshd) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%"PRIi64"ms) "
+ "exceeds halo threshold (%"PRIi64"), "
+ "but halo_max_replicas (%d) exceeded, "
+ "marking child down.",
+ *child_latency_msec,
+ halo_max_latency_msec,
+ priv->halo_max_replicas);
+ *event = GF_EVENT_CHILD_DOWN;
+ }
+ }
+
+ if (*event != GF_EVENT_CHILD_PING &&
+ gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+ gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:");
+ dump_halo_states (this);
+ }
+}
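
/*
 * Editor's illustrative sketch (not part of the patch above): the three
 * ping-event cases reduced to a pure decision function so the state machine
 * is easier to see.  Return value: +1 mark the child up, -1 mark it down,
 * 0 leave it alone.  All names here are hypothetical, and the "halo disabled
 * for this child" sub-cases are omitted for brevity.
 */
#include <stdint.h>
#include <stdio.h>

static int halo_ping_decision (int64_t latency, int64_t max_latency,
                               int child_up, int up_children,
                               int min_replicas, int max_replicas,
                               int is_worst_up_child, int is_shd)
{
        /* Case 1: too slow, and min_replicas stays satisfied without it */
        if (latency > max_latency && child_up &&
            up_children > min_replicas && is_worst_up_child)
                return -1;

        /* Case 2: fast enough, currently down, room under max_replicas */
        if (latency <= max_latency && !child_up &&
            up_children < max_replicas)
                return 1;

        /* Case 3: fast enough but max_replicas is already exceeded;
         * shed the worst child, unless we are the self-heal daemon. */
        if (latency <= max_latency && child_up &&
            up_children > max_replicas && is_worst_up_child && !is_shd)
                return -1;

        return 0;
}

int main (void)
{
        /* 12 ms child, 5 ms ceiling, 3 children up, min_replicas 2: drop it */
        printf ("%d\n", halo_ping_decision (12, 5, 1, 3, 2, 99999, 1, 0));
        return 0;
}
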
+
+void
+_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t halo_max_latency_msec,
+ int32_t *event, int32_t *call_psh, int32_t *up_child,
+ gf_boolean_t child_halo_enabled)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int up_children = 0;
+ int worst_up_child = -1;
+ gf_boolean_t was_down = _gf_false;
+
+ priv = this->private;
+
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ /*
+ * Track the fact that we did this; we may need to revert it
+ * if we later decide to mark this brick down.
+ */
+ was_down = _gf_true;
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ *call_psh = 1;
+ *up_child = idx;
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
+
+ /*
+ * Handle the edge case where we exceed
+ * halo_min_replicas and we've got a child which is
+ * marked up as it was helping to satisfy the
+ * halo_min_replicas even though its latency exceeds
+ * halo_max_latency_msec.
+ */
+ if (child_halo_enabled == _gf_true &&
+ up_children > priv->halo_min_replicas) {
+ worst_up_child = find_worst_up_child (this);
+ if (worst_up_child >= 0 &&
+ priv->child_latency[worst_up_child] >
+ halo_max_latency_msec) {
+ if (was_down == _gf_true)
+ priv->event_generation--;
+ *call_psh = 0;
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Marking child %d down, "
+ "doesn't meet halo threshold "
+ "(%ld), and > "
+ "halo_min_replicas (%d)",
+ worst_up_child,
+ halo_max_latency_msec,
+ priv->halo_min_replicas);
+ goto out;
+ }
+ }
+ if (priv->halo_enabled &&
+ up_children > priv->halo_max_replicas &&
+ !priv->shd.iamshd) {
+ if (was_down == _gf_true)
+ priv->event_generation--;
+ *call_psh = 0;
+ worst_up_child = find_worst_up_child (this);
+ if (worst_up_child < 0) {
+ worst_up_child = idx;
+ }
+ priv->child_up[worst_up_child] = 0;
+ gf_log (this->name, GF_LOG_INFO,
+ "Marking child %d down, "
+ "up_children (%d) > "
+ "halo_max_replicas (%d)",
+ worst_up_child,
+ up_children,
+ priv->halo_max_replicas);
+ up_children--;
+ goto out;
+ }
+out:
+ if (up_children == 1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Subvolume '%s' came back up; "
+ "going online.",
+ child_xlator->name);
+ } else {
+ *event = GF_EVENT_CHILD_MODIFIED;
+ }
+
+ priv->last_event[idx] = *event;
+
+ if (gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+ gf_log (this->name, GF_LOG_DEBUG, "New halo states:");
+ dump_halo_states (this);
+ }
+}
+
+void
+_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
+ int idx, int64_t child_latency_msec,
+ int64_t halo_max_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child,
+ gf_boolean_t child_halo_enabled)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int up_children = 0;
+ int down_children = 0;
+ int best_down_child = -1;
+ gf_boolean_t swap_child = _gf_false;
+
+ priv = this->private;
+
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->event_generation++;
+ }
+
+ /*
+ * If this is an _actual_ CHILD_DOWN event, we
+ * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to
+ * indicate the child is really disconnected.
+ */
+ if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) {
+ priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY;
+ }
+ priv->child_up[idx] = 0;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
+
+ /*
+ * Handle the edge case where we need to find the
+ * next best child (to mark up) as marking this child
+ * down would cause us to fall below halo_min_replicas.
+ * We will also force the SHD to heal this child _now_
+ * as we want it to be up to date if we are going to
+ * begin using it synchronously.
+ */
+ best_down_child = find_best_down_child (this);
+ if (child_halo_enabled == _gf_true) {
+ if (up_children < priv->halo_min_replicas &&
+ priv->halo_failover_enabled == _gf_true)
+ swap_child = _gf_true;
+ else if (best_down_child >= 0 &&
+ up_children < priv->halo_max_replicas &&
+ priv->child_latency[best_down_child] <=
+ halo_max_latency_msec &&
+ priv->halo_failover_enabled == _gf_true)
+ swap_child = _gf_true;
+ }
+
+ if (swap_child) {
+ if (best_down_child >= 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Swapping out child %d for "
+ "child %d to satisfy "
+ "halo_min_replicas (%d).",
+ idx, best_down_child,
+ priv->halo_min_replicas);
+ priv->child_up[best_down_child] = 1;
+ *call_psh = 1;
+ *up_child = best_down_child;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "All subvolumes are down. Going "
+ "offline until atleast one of them "
+ "comes back up.");
+ } else {
+ *event = GF_EVENT_CHILD_MODIFIED;
+ }
+ priv->last_event[idx] = *event;
+
+ if (gf_log_get_loglevel () >= GF_LOG_DEBUG) {
+ gf_log (this->name, GF_LOG_DEBUG, "New halo states:");
+ dump_halo_states (this);
+ }
+}
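
/*
 * Editor's illustrative sketch (not part of the patch above): the swap-in
 * decision used when a child goes down, as a predicate.  A replacement is
 * brought up only when failover is enabled and either halo_min_replicas is
 * no longer met, or there is head-room under halo_max_replicas and the best
 * down child is itself inside the halo.  Names are hypothetical.
 */
#include <stdint.h>

static int should_swap_in_replacement (int up_children, int min_replicas,
                                       int max_replicas,
                                       int64_t best_down_latency,
                                       int64_t max_latency,
                                       int failover_enabled,
                                       int have_down_child)
{
        if (!failover_enabled || !have_down_child)
                return 0;
        if (up_children < min_replicas)
                return 1;
        return up_children < max_replicas &&
               best_down_latency <= max_latency;
}
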
+
+int64_t
+_afr_get_halo_latency (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int64_t halo_max_latency_msec = 0;
+
+ priv = this->private;
+
+ if (priv->shd.iamshd) {
+ halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+ } else if (priv->nfsd.iamnfsd) {
+ halo_max_latency_msec =
+ priv->nfsd.halo_max_latency_msec;
+ } else {
+ halo_max_latency_msec = priv->halo_max_latency_msec;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %ld",
+ halo_max_latency_msec);
+ return halo_max_latency_msec;
+}
+
+
int32_t
afr_notify (xlator_t *this, int32_t event,
void *data, void *data2)
{
+ xlator_t *child_xlator = NULL;
afr_private_t *priv = NULL;
int i = -1;
- int up_children = 0;
- int down_children = 0;
int propagate = 0;
int had_heard_from_all = 0;
int have_heard_from_all = 0;
int idx = -1;
int ret = -1;
int call_psh = 0;
+ int up_child = -1;
+ uint64_t latency_samples = 0;
dict_t *input = NULL;
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY;
+ gf_boolean_t child_halo_enabled = _gf_false;
+ child_xlator = (xlator_t *)data;
priv = this->private;
if (!priv)
@@ -4347,8 +4991,9 @@ afr_notify (xlator_t *this, int32_t event,
* that we could end up issuing N lookups to the first subvolume, and
* O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
*/
+ priv->did_local_discovery = _gf_false;
priv->did_discovery = _gf_false;
-
+ latency_samples = child_xlator->client_latency.count;
/* parent xlators dont need to know about every child_up, child_down
* because of afr ha. If all subvolumes go down, child_down has
@@ -4359,7 +5004,7 @@ afr_notify (xlator_t *this, int32_t event,
* subsequent revalidate lookup happens on all the dht's subvolumes
* which triggers afr self-heals if any.
*/
- idx = find_child_index (this, data);
+ idx = find_child_index (this, child_xlator);
if (idx < 0) {
gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
"Received child_up from invalid subvolume");
@@ -4368,6 +5013,28 @@ afr_notify (xlator_t *this, int32_t event,
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
this);
+
+ if (!priv->halo_enabled ||
+ latency_samples < priv->halo_min_samples) {
+ child_halo_enabled = _gf_false;
+ halo_max_latency_msec = INT64_MAX;
+ } else {
+ child_halo_enabled = _gf_true;
+ halo_max_latency_msec = _afr_get_halo_latency (this);
+ }
+
+ if (event == GF_EVENT_CHILD_PING) {
+ /* Calculates the child latency and sets event
+ */
+ LOCK (&priv->lock);
+ {
+ _afr_handle_ping_event (this, child_xlator, idx,
+ halo_max_latency_msec, &event,
+ &child_latency_msec, child_halo_enabled);
+ }
+ UNLOCK (&priv->lock);
+ }
+
if (event == GF_EVENT_TRANSLATOR_OP) {
LOCK (&priv->lock);
{
@@ -4394,52 +5061,16 @@ afr_notify (xlator_t *this, int32_t event,
propagate = 1;
break;
case GF_EVENT_CHILD_UP:
- /*
- * This only really counts if the child was never up
- * (value = -1) or had been down (value = 0). See
- * comment at GF_EVENT_CHILD_DOWN for a more detailed
- * explanation.
- */
- if (priv->child_up[idx] != 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 1;
-
- call_psh = 1;
- up_children = __afr_get_up_children_count (priv);
- if (up_children == 1) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SUBVOL_UP,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
- } else {
- event = GF_EVENT_CHILD_MODIFIED;
- }
-
- priv->last_event[idx] = event;
-
+ _afr_handle_child_up_event (this, child_xlator,
+ idx, halo_max_latency_msec, &event, &call_psh,
+ &up_child, child_halo_enabled);
break;
case GF_EVENT_CHILD_DOWN:
- if (priv->child_up[idx] == 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 0;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 0)
- down_children++;
- if (down_children == priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
- } else {
- event = GF_EVENT_SOME_CHILD_DOWN;
- }
-
- priv->last_event[idx] = event;
-
+ _afr_handle_child_down_event (this, child_xlator, idx,
+ child_latency_msec, halo_max_latency_msec,
+ &event, &call_psh, &up_child,
+ child_halo_enabled);
break;
case GF_EVENT_CHILD_CONNECTING:
@@ -4466,7 +5097,6 @@ afr_notify (xlator_t *this, int32_t event,
had come up, propagate CHILD_UP, but only this time
*/
event = GF_EVENT_CHILD_DOWN;
- up_children = __afr_get_up_children_count (priv);
for (i = 0; i < priv->child_count; i++) {
if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
event = GF_EVENT_CHILD_UP;
@@ -4542,7 +5172,7 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
local->call_count = AFR_COUNT (local->child_up, priv->child_count);
if (local->call_count == 0) {
gf_msg (THIS->name, GF_LOG_INFO, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN, "no subvolumes up");
+ AFR_MSG_ALL_SUBVOLS_DOWN, "no bricks up");
if (op_errno)
*op_errno = ENOTCONN;
goto out;
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 2b369ca3c68..a917bc08ae0 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1538,6 +1538,15 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
/*
+ * Heal daemons don't have IO threads ... and as a result they
+ * send this getxattr down and eventually crash :(
+ */
+ if (strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
* Special xattrs which need responses from all subvols
*/
if (afr_is_special_xattr (name, &cbk, 0)) {
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7f7962013d7..c7d6261b110 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -46,7 +46,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_spbc_timeout_t,
gf_afr_mt_spb_status_t,
gf_afr_mt_empty_brick_t,
- gf_afr_mt_end
+ gf_afr_mt_child_latency_t,
+ gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index adf5ab20a6c..629f1c6a7da 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -664,6 +664,20 @@ out:
}
+static int
+replies_are_same (struct afr_reply *replies, int i, int k)
+{
+ if (replies[k].poststat.ia_mtime != replies[i].poststat.ia_mtime) {
+ return _gf_false;
+ }
+ if (replies[k].poststat.ia_size != replies[i].poststat.ia_size) {
+ return _gf_false;
+ }
+
+ return gf_uuid_compare (replies[i].poststat.ia_gfid,
+ replies[k].poststat.ia_gfid) == 0;
+}
+
int
afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
inode_t *inode)
@@ -683,14 +697,10 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
priv->children[i]->name,
replies[i].poststat.ia_mtime,
replies[i].poststat.ia_size,
- uuid_utoa (inode->gfid));
+ uuid_utoa (replies[i].poststat.ia_gfid));
vote_count = 0;
- for (k = 0; k < priv->child_count; k++) {
- if ((replies[k].poststat.ia_mtime ==
- replies[i].poststat.ia_mtime) &&
- (replies[k].poststat.ia_size ==
- replies[i].poststat.ia_size)
- ) {
+ for (k = 1; k < priv->child_count; k++) {
+ if (replies_are_same (replies, i, k)) {
vote_count++;
}
}
@@ -724,7 +734,7 @@ afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
priv->children[i]->name,
replies[i].poststat.ia_mtime,
replies[i].poststat.ia_mtime_nsec,
- uuid_utoa (inode->gfid));
+ uuid_utoa (replies[i].poststat.ia_gfid));
if (replies[i].poststat.ia_mtime > cmp_mtime) {
cmp_mtime = replies[i].poststat.ia_mtime;
cmp_mtime_nsec =
@@ -764,7 +774,7 @@ afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
priv->children[i]->name,
replies[i].poststat.ia_ctime,
replies[i].poststat.ia_ctime_nsec,
- uuid_utoa (inode->gfid));
+ uuid_utoa (replies[i].poststat.ia_gfid));
if (replies[i].poststat.ia_ctime > cmp_ctime) {
cmp_ctime = replies[i].poststat.ia_ctime;
cmp_ctime_nsec =
@@ -802,7 +812,7 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
"file size = %lu for gfid %s",
priv->children[i]->name,
replies[i].poststat.ia_size,
- uuid_utoa (inode->gfid));
+ uuid_utoa (replies[i].poststat.ia_gfid));
if (replies[i].poststat.ia_size > cmp_sz) {
cmp_sz = replies[i].poststat.ia_size;
fav_child = i;
@@ -901,7 +911,7 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
"data in file (gfid:%s) by %s (%lu bytes @ %s mtime, "
"%s ctime).",
priv->children[fav_child]->name,
- uuid_utoa (inode->gfid),
+ uuid_utoa (replies[fav_child].poststat.ia_gfid),
policy_str,
replies[fav_child].poststat.ia_size,
mtime_str,
@@ -929,6 +939,7 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies,
+
afr_transaction_type type)
{
afr_local_t *local = NULL;
@@ -1201,7 +1212,6 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
}
}
-
/* count the number of dirty fops witnessed */
for (i = 0; i < priv->child_count; i++)
witness[i] += dirty[i];
@@ -1209,6 +1219,67 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
return 0;
}
+/*
+ * This function will examine a reply and look for a PGFID xattr
+ * and if found will record this in the frame's local struct.
+ *
+ * This can then be used to fall-back to healing the parent
+ * directory in cases where metadata/data healing isn't yet
+ * possible because an entry heal of the parent directory has not
+ * yet taken place.
+ *
+ * This is critical for a couple of reasons:
+ * 1. General healing predictability - When the SHD
+ * attempts to heal a given GFID, it should be able
+ * to do so without having to wait for some other
+ * dependent heal to take place.
+ * 2. Reliability - In some cases the parent directory
+ * may require healing, but the req'd entry in the
+ * indices/xattrop directory may not exist
+ * (e.g. bugs/crashes etc). This feature removes
+ * that dependency.
+ */
+void
+_afr_set_heal_pgfid_from_reply (xlator_t *this, afr_local_t *local,
+ struct afr_reply reply)
+{
+ data_pair_t *trav = reply.xdata->members_list;
+ uuid_t *pgfid = NULL;
+ int32_t ret = 0;
+ int32_t pgfid_prefix_len = sizeof (PGFID_XATTR_KEY_PREFIX) - 1;
+ char *pgfid_str = NULL;
+ data_t *ancestry_path_data = NULL;
+ char *ancestry_path = "Unknown";
+
+ pgfid = &local->heal_pgfid;
+
+ while (trav) {
+ if (!strncmp (PGFID_XATTR_KEY_PREFIX, trav->key,
+ pgfid_prefix_len)) {
+ pgfid_str = trav->key + pgfid_prefix_len;
+ ret = gf_uuid_parse (pgfid_str, *pgfid);
+ break;
+ }
+ trav = trav->next;
+ }
+
+ if (!ret && !gf_uuid_is_null (*pgfid)) {
+ if (!dict_lookup (reply.xdata,
+ "glusterfs.ancestry.path",
+ &ancestry_path_data)) {
+ ancestry_path = data_to_str (
+ ancestry_path_data);
+ /* Allocation free'd on local destroy */
+ local->heal_ancestry_path =
+ gf_strdup (ancestry_path);
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found pgfid (%s) for %s",
+ uuid_utoa (*pgfid),
+ ancestry_path);
+ }
+}
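
/*
 * Editor's illustrative sketch (not part of the patch above): how a parent
 * GFID can be recovered from an xattr key of the form "<prefix><uuid>".
 * The prefix literal and sample key below are assumptions for illustration;
 * the real code uses PGFID_XATTR_KEY_PREFIX and gf_uuid_parse.
 */
#include <stdio.h>
#include <string.h>
#include <uuid/uuid.h>

int main (void)
{
        const char *prefix = "trusted.pgfid.";          /* assumed prefix */
        const char *key =
                "trusted.pgfid.b49189a4-37e1-4ba3-93d5-45a4a38d7d56";
        uuid_t pgfid;
        char out[37];

        if (strncmp (key, prefix, strlen (prefix)) == 0 &&
            uuid_parse (key + strlen (prefix), pgfid) == 0) {
                uuid_unparse (pgfid, out);
                printf ("parent gfid: %s\n", out);
        }
        return 0;
}
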
+
void
afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
int source, unsigned char *sources,
@@ -1239,7 +1310,6 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
}
}
}
-
if (ret < 0) {
status = "Failed";
loglevel = GF_LOG_DEBUG;
@@ -1777,6 +1847,8 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
gf_boolean_t *entry_selfheal)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
inode_t *inode = NULL;
int i = 0;
int valid_cnt = 0;
@@ -1785,6 +1857,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
int ret = -1;
priv = this->private;
+ local = frame->local;
inode = afr_inode_find (this, gfid);
if (!inode)
@@ -1802,6 +1875,10 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (replies[i].op_ret == -1)
continue;
+ if (gf_uuid_is_null(local->heal_pgfid))
+ _afr_set_heal_pgfid_from_reply (this,
+ frame->local, replies[i]);
+
/* The data segment of the changelog can be non-zero to indicate
* the directory needs a full heal. So the check below ensures
* it's not a directory before setting the data_selfheal boolean.
@@ -1814,8 +1891,11 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
afr_is_metadata_set (this, replies[i].xdata))
*metadata_selfheal = _gf_true;
- if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata))
- *entry_selfheal = _gf_true;
+ if ((!priv->shd.iamshd && AFR_IS_ROOT_GFID (gfid) &&
+ priv->did_discovery == _gf_false) ||
+ (entry_selfheal &&
+ afr_is_entry_set (this, replies[i].xdata)))
+ *entry_selfheal = _gf_true;
valid_cnt++;
if (valid_cnt == 1) {
@@ -1831,8 +1911,14 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
(int) replies[i].poststat.ia_type,
priv->children[i]->name,
uuid_utoa (replies[i].poststat.ia_gfid));
- ret = -EIO;
- goto out;
+
+ if (priv->gfid_splitbrain_forced_heal &&
+ metadata_selfheal) {
+ *metadata_selfheal = _gf_true;
+ } else {
+ ret = -EIO;
+ goto out;
+ }
}
if (!IA_EQUAL (first, replies[i].poststat, uid)) {
@@ -1875,6 +1961,15 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
*metadata_selfheal = _gf_true;
}
+ /* Force entry healing of directories for SHDs regardless
+ * of the entry healing portion of the change log.
+ */
+ if (IA_ISDIR(first.ia_type) && priv->shd.iamshd &&
+ IA_EQUAL (first, replies[i].poststat, type) &&
+ entry_selfheal) {
+ *entry_selfheal = _gf_true;
+ }
+
if (IA_ISREG(first.ia_type) &&
!IA_EQUAL (first, replies[i].poststat, size)) {
gf_msg_debug (this->name, 0,
@@ -1970,6 +2065,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
{
int ret = 0;
int i = 0;
+ int source_count = 0;
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
int **changelog = NULL;
@@ -1990,12 +2086,27 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
goto out;
}
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- ret |= afr_selfheal_post_op (frame, this, inode, i, xattr,
+ /* Pre-compute how many sources we have; if we made it in here
+ * without any sources defined, we are doing a conservative
+ * merge.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source_count++;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ /* If there are no sources we are doing a conservative
+ * merge. In such a case ensure we mark the changelog
+ * on all replicas.
+ */
+ if (!sources[i] && source_count) {
+ continue;
+ }
+ ret |= afr_selfheal_post_op (frame, this, inode, i, xattr,
NULL);
- }
+ }
out:
if (changelog)
afr_matrix_cleanup (changelog, priv->child_count);
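
/*
 * Editor's illustrative sketch (not part of the patch above): the skip/mark
 * decision from the loop above as a predicate.  Normally only the sources
 * get the new-entry changelog mark; with no sources at all (conservative
 * merge) every child gets it.  The name is hypothetical.
 */
static int should_mark_newentry (int is_source, int source_count)
{
        return is_source || source_count == 0;
}
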
@@ -2029,6 +2140,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
&data_selfheal,
&metadata_selfheal,
&entry_selfheal);
+
if (ret)
goto out;
@@ -2075,10 +2187,19 @@ int
afr_selfheal (xlator_t *this, uuid_t gfid)
{
int ret = -1;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
+ gf_boolean_t tried_parent = _gf_false;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ char *ancestry_path = "Unknown";
+ char *pgfid_str = NULL;
+ char *gfid_str = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+heal_gfid:
+ frame = afr_frame_create (this);
- frame = afr_frame_create (this);
if (!frame)
return ret;
@@ -2087,6 +2208,47 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
ret = afr_selfheal_do (frame, this, gfid);
+ if (priv->pgfid_self_heal == _gf_true &&
+ tried_parent == _gf_false && (ret != 0 && ret != 2) &&
+ !gf_uuid_is_null (local->heal_pgfid)) {
+ tried_parent = _gf_true;
+ pgfid_str = alloca (strlen (UUID0_STR) + 1);
+ gfid_str = alloca (strlen (UUID0_STR) + 1);
+ uuid_utoa_r (local->heal_pgfid, pgfid_str);
+ uuid_utoa_r (gfid, gfid_str);
+ if (local->heal_ancestry_path)
+ ancestry_path = local->heal_ancestry_path;
+ gf_log (this->name, GF_LOG_INFO,
+ "PGFID Healing - Heal failed for %s (%s), "
+ "but found parent gfid (%s), attempting to heal "
+ "parent directory by gfid.",
+ gfid_str,
+ ancestry_path,
+ pgfid_str);
+ ret = afr_selfheal (this, local->heal_pgfid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "PGFID Healing - Healing of parent gfid "
+ "(%s) unsuccessful! Healing of %s (%s) "
+ "failed.",
+ pgfid_str,
+ gfid_str,
+ ancestry_path);
+ } else {
+ gf_log (this->name, GF_LOG_INFO,
+ "PGFID Healing - Healing of parent gfid %s "
+ "successful! Re-attempting heal of %s (%s).",
+ pgfid_str,
+ gfid_str,
+ ancestry_path);
+ if (frame) {
+ AFR_STACK_DESTROY (frame);
+ frame = NULL;
+ }
+ goto heal_gfid;
+ }
+ }
+
if (frame)
AFR_STACK_DESTROY (frame);
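
/*
 * Editor's illustrative sketch (not part of the patch above): the control
 * flow used here, i.e. "heal the gfid; if that fails and a parent gfid was
 * discovered, heal the parent directory once and retry".  heal() returning
 * 0 on success and the function names are hypothetical stand-ins.
 */
static int heal_with_pgfid_fallback (void *ctx,
                                     int (*heal) (void *ctx, const char *gfid),
                                     const char *gfid, const char *pgfid)
{
        int ret = heal (ctx, gfid);

        if (ret != 0 && pgfid != NULL) {
                if (heal (ctx, pgfid) == 0)     /* parent healed ...  */
                        ret = heal (ctx, gfid); /* ... so try again   */
        }
        return ret;
}
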
@@ -2230,3 +2392,19 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+void
+afr_sh_get_source_by_policy (xlator_t *this,
+ unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies, inode_t *inode)
+{
+ int fav_child = -1;
+ char *policy_str;
+
+ fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+ &policy_str);
+ if (fav_child < 0)
+ return; /* no favorite child policy is set */
+ sources[fav_child] = 1;
+ healed_sinks[fav_child] = 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index cf03a9ec680..c1e945bfd82 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -324,7 +324,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_private_t *priv = NULL;
off_t off = 0;
- size_t block = 128 * 1024;
+ size_t block = 0;
int type = AFR_SELFHEAL_DATA_FULL;
int ret = -1;
call_frame_t *iter_frame = NULL;
@@ -336,6 +336,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
healed_sinks[ARBITER_BRICK_INDEX] = 0;
}
+ block = 128 * 1024 * priv->data_self_heal_window_size;
+
type = afr_data_self_heal_type_get (priv, healed_sinks, source,
replies);
@@ -716,7 +718,6 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto unlock;
ret = 0;
-
}
unlock:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
@@ -752,7 +753,6 @@ skip_undo_pending:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
data_lock);
out:
-
if (did_sh)
afr_log_selfheal (fd->inode->gfid, this, ret, "data", source,
sources, healed_sinks);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 5b536b0ded8..25f8ea313aa 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -16,16 +16,6 @@
#include "afr-messages.h"
#include "syncop-utils.h"
-/* Max file name length is 255 this filename is of length 256. No file with
- * this name can ever come, entry-lock with this name is going to prevent
- * self-heals from older versions while the granular entry-self-heal is going
- * on in newer version.*/
-#define LONG_FILENAME "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-
static int
afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name,
inode_t *inode, int child, struct afr_reply *replies)
@@ -66,7 +56,30 @@ afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name,
ret = syncop_unlink (subvol, &loc, NULL, NULL);
break;
}
- }
+ /* Handle edge case where directories exist in a partially
+ * created state: empty, without a gfid assigned. We need to
+ * remove these bad dirs so the normal entry heal process
+ * can take place.
+ */
+ } else if (replies[child].valid &&
+ replies[child].op_ret == -1 &&
+ replies[child].op_errno == ENODATA &&
+ gf_uuid_is_null (replies[child].poststat.ia_gfid)) {
+ if (replies[child].poststat.ia_type == IA_INVAL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging orphaned (gfid-less) dir "
+ "%s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid,
+ g), subvol->name);
+ /* We will only do this for _directories_, and this
+ * will only succeed for directories _without_
+ * data. The file case is handled well already
+ * through the metadata self-heal process.
+ */
+ ret = syncop_rmdir (subvol, &loc, 1, NULL, NULL);
+ }
+ }
loc_wipe (&loc);
@@ -299,11 +312,12 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
+ /* Returning EIO here isn't needed if GFID forced heal is
+ * enabled.
+ */
/* In case of a gfid or type mismatch on the entry, return -1.*/
- ret = afr_selfheal_detect_gfid_and_type_mismatch (this, replies,
- fd->inode->gfid,
- name, source);
-
+ ret = afr_selfheal_detect_gfid_and_type_mismatch (this,
+ replies, fd->inode->gfid, name, source);
if (ret < 0)
return ret;
@@ -314,10 +328,20 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (replies[i].op_errno != ENOENT)
continue;
- ret = afr_selfheal_recreate_entry (frame, i, source, sources,
- fd->inode, name, inode,
- replies);
- }
+ /* Re-create the entry in the event the child
+ * does not have it, or the entry does not have
+ * a gfid. In the latter case we'll only do
+ * this for now if it's directory, this can be
+ * widened to include files at a later time.
+ */
+ if (replies[i].op_errno == ENOENT ||
+ (replies[i].op_errno == ENODATA &&
+ gf_uuid_is_null (replies[i].poststat.ia_gfid))) {
+ ret = afr_selfheal_recreate_entry (
+ frame, i, source, sources, fd->inode, name, inode,
+ replies);
+ }
+ }
return ret;
}
@@ -435,7 +459,9 @@ __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count || afr_does_witness_exist (this, witness)) {
+ || !sources_count || afr_does_witness_exist (this, witness)
+ || (sources_count == priv->child_count &&
+ priv->did_discovery == _gf_false)) {
memset (sources, 0, sizeof (*sources) * priv->child_count);
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
@@ -652,7 +678,6 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
afr_private_t *priv = NULL;
gf_boolean_t mismatch = _gf_false;
- afr_local_t *iter_local = NULL;
afr_local_t *local = NULL;
loc_t loc = {0,};
@@ -685,10 +710,34 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
!strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))
continue;
+ /* Common Case: First do a cheap normal entry_dirent
+ * flow */
ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
+
+ /* Edge Case: Do name heal to fix gfid split
+ * brains and other damage to directory
+ * entries.
+ */
+ if (ret) {
+ /* If the cheap flow didn't work, let's head
+ * into the name self-heal flow. Here we'll
+ * inspect for GFID split-brains and fix if
+ * found. Then send it back to the normal
+ * entry_dirent flow.
+ */
+ ret = afr_selfheal_name (this, fd->inode->gfid,
+ entry->d_name, NULL);
+ if (!ret) {
+ ret = afr_selfheal_entry_dirent (
+ iter_frame, this, fd,
+ entry->d_name, loc.inode, subvol,
+ local->need_full_crawl);
+ }
+ }
+
AFR_STACK_RESET (iter_frame);
if (iter_frame->local == NULL) {
ret = -ENOTCONN;
@@ -1045,45 +1094,22 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_tie_breaker_entrylk (frame, this, inode,
priv->sh_domain, NULL,
locked_on);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "entry self-heal as only %d sub-volumes could "
- "be locked in %s domain",
- uuid_utoa (fd->inode->gfid), ret,
- priv->sh_domain);
- /* Either less than two subvols available, or another
- selfheal (from another server) is in progress. Skip
- for now in any case there isn't anything to do.
- */
- ret = -ENOTCONN;
- goto unlock;
- }
+ if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ gf_msg_debug (this->name, 0, "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa (fd->inode->gfid), ret,
+ priv->sh_domain);
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry (frame, this, fd, locked_on);
- if (!granular_locks) {
- ret = afr_selfheal_tryentrylk (frame, this, inode,
- this->name, LONG_FILENAME,
- long_name_locked);
- }
- {
- if (!granular_locks && ret < 1) {
- gf_msg_debug (this->name, 0, "%s: Skipping"
- " entry self-heal as only %d "
- "sub-volumes could be "
- "locked in special-filename "
- "domain",
- uuid_utoa (fd->inode->gfid),
- ret);
- ret = -ENOTCONN;
- goto unlock;
- }
- ret = __afr_selfheal_entry (frame, this, fd, locked_on);
- }
- if (!granular_locks)
- afr_selfheal_unentrylk (frame, this, inode, this->name,
- LONG_FILENAME, long_name_locked,
- NULL);
- }
unlock:
afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL,
locked_on, NULL);
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index db1b1cc889f..4570ace7ef7 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -216,6 +216,17 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
sources_count = AFR_COUNT (sources, priv->child_count);
+ /* __afr_selfheal_metadata_prepare tinkers with the state
+ * of healed_sinks prematurely (the source hasn't
+ * actually been finalized yet!), so reset the children
+ * which aren't our source to sinks so we can heal.
+ * I'll leave it to the AFR2 maintainer to fix that code
+ * in the future as they may have had a good reason.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] && locked_on[i])
+ healed_sinks[i] = 1;
+ }
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index b28ce4170f1..9ca56f8bd9d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -13,6 +13,288 @@
#include "afr-self-heal.h"
#include "afr-messages.h"
+
+
+/*
+ * Helper function to create the destination location for the copy
+ * of the directory entry we are moving out of the way.
+ */
+static int
+_afr_sh_create_unsplit_loc (struct afr_reply *replies, const int child_idx,
+ loc_t *loc, loc_t *unsplit_loc)
+{
+ int ret = 0;
+ int new_path_len = 0;
+ int new_name_len = 0;
+ char *new_path = NULL;
+ char *new_name = NULL;
+ char *tmp_gfid_str;
+ const char *filename = NULL;
+ uuid_t rand_uuid;
+
+ tmp_gfid_str = alloca (sizeof (UUID0_STR));
+
+ /*
+ * All of these allocations will be cleaned up
+ * @ afr_sh_gfid_unsplit_rename_done via loc_wipe.
+ */
+ if (loc_copy (unsplit_loc, loc)) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ inode_unref (unsplit_loc->inode);
+ unsplit_loc->inode = inode_new (loc->inode->table);
+ unsplit_loc->parent = inode_ref (loc->parent);
+ gf_uuid_copy (unsplit_loc->inode->gfid,
+ replies[child_idx].poststat.ia_gfid);
+ unsplit_loc->inode->ia_type = loc->inode->ia_type;
+
+ gf_uuid_generate (rand_uuid);
+ /* Note: Use re-entrant version of uuid_utoa! */
+ tmp_gfid_str = uuid_utoa_r (rand_uuid, tmp_gfid_str);
+
+ /* Copy the GFIDs, file + parent directory */
+ gf_uuid_copy (unsplit_loc->gfid, rand_uuid);
+ gf_uuid_copy (unsplit_loc->pargfid,
+ replies[child_idx].postparent.ia_gfid);
+
+ filename = loc->name;
+
+ /*
+ * New path: Add 11 for the NUL + ".unsplit_" + "_". We _could_ nuke
+ * tmp_gfid_str entirely here, iff we assume the uuid_utoa
+ * formatting will _never_ change. If we assume this we can just add
+ * 36 to the length and call uuid_utoa directly in the snprintf.
+ */
+ new_path_len = strlen (filename) + strlen (tmp_gfid_str) + 11;
+ new_path = GF_CALLOC (1, new_path_len, gf_common_mt_char);
+ if (!new_path) {
+ ret = ENOMEM;
+ goto err;
+ }
+ snprintf (new_path, new_path_len, ".unsplit_%s_%s", tmp_gfid_str,
+ filename);
+ unsplit_loc->path = new_path;
+
+ /* New name: Add 11 for null + ".unsplit_" + "_" */
+ new_name_len = strlen (loc->name) + strlen (tmp_gfid_str) + 11;
+ new_name = GF_CALLOC (1, new_name_len, gf_common_mt_char);
+ if (!new_name) {
+ ret = ENOMEM;
+ goto err;
+ }
+ snprintf (new_name, new_name_len, ".unsplit_%s_%s", tmp_gfid_str,
+ loc->name);
+ unsplit_loc->name = new_name;
+
+ return 0;
+err:
+ GF_FREE (new_path);
+ GF_FREE (new_name);
+ return ret;
+}
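
/*
 * Editor's illustrative sketch (not part of the patch above): the
 * ".unsplit_<uuid>_<name>" naming used by the function above.  The 11 extra
 * bytes are ".unsplit_" (9) + "_" (1) + the terminating NUL (1).  This uses
 * plain libuuid/libc calls instead of the gluster wrappers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <uuid/uuid.h>

int main (void)
{
        const char *name = "report.txt";
        uuid_t rand_uuid;
        char gfid_str[37];
        char *new_name;
        size_t len;

        uuid_generate (rand_uuid);
        uuid_unparse (rand_uuid, gfid_str);

        len = strlen (name) + strlen (gfid_str) + 11;
        new_name = calloc (1, len);
        if (!new_name)
                return 1;
        snprintf (new_name, len, ".unsplit_%s_%s", gfid_str, name);
        puts (new_name);   /* e.g. ".unsplit_<uuid>_report.txt" */
        free (new_name);
        return 0;
}
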
+
+static int
+_afr_gfid_unsplit_rename_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iatt *buf, struct iatt *preoldparent,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rename entry %s/%s failed on child %d, reason: %s",
+ uuid_utoa (local->loc.pargfid),
+ local->loc.name, child_index, strerror (op_errno));
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "GFID unsplit successful on %s/%s, on child %d",
+ uuid_utoa (local->loc.pargfid), local->loc.name, child_index);
+ }
+
+ syncbarrier_wake (&local->barrier);
+ return 0;
+}
+int
+__afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on,
+ struct afr_reply *replies, inode_t *inode,
+ loc_t *loc)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ loc_t *unsplit_loc;
+ unsigned int i = 0;
+ unsigned int split_count = 0;
+ unsigned char *rename_list;
+ int ret = 0;
+
+ frame = afr_frame_create (this);
+ if (!frame)
+ return ENOMEM;
+
+ local = frame->local; // Local variables for our frame
+ priv = this->private; // xlator specific variables
+ rename_list = alloca0 (priv->child_count);
+
+ if (loc_copy (&local->loc, loc)) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ /* Pre-compute the number of rename calls we will be doing */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] &&
+ !gf_uuid_is_null (replies[i].poststat.ia_gfid) &&
+ gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) {
+ split_count++;
+ }
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "Found %d split-brained gfid's.",
+ split_count);
+
+ local->unsplit_locs = GF_CALLOC (priv->child_count,
+ sizeof (*unsplit_loc), gf_afr_mt_loc_t);
+ if (!local->unsplit_locs) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ afr_local_replies_wipe (local, priv);
+ local->call_count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ unsplit_loc = &local->unsplit_locs[i];
+ if (locked_on[i] && local->child_up[i] &&
+ replies[i].op_errno != ENOENT &&
+ !gf_uuid_is_null (replies[i].poststat.ia_gfid) &&
+ gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) {
+ ret = _afr_sh_create_unsplit_loc (replies, i,
+ loc, unsplit_loc);
+ gf_log (this->name, GF_LOG_INFO, "Renaming child %d to "
+ " %s/%s to resolve gfid split-brain.", i,
+ uuid_utoa (unsplit_loc->pargfid),
+ unsplit_loc->name);
+ rename_list[i] = 1;
+ /* frame, rfn, cky, obj, fn, params */
+ STACK_WIND_COOKIE (frame,
+ _afr_gfid_unsplit_rename_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->rename,
+ loc, unsplit_loc, NULL);
+ local->call_count++;
+ }
+ }
+ syncbarrier_wait (&local->barrier, local->call_count);
+
+out:
+ for (i = 0; i < priv->child_count; i++) {
+ if (rename_list[i])
+ loc_wipe (&local->unsplit_locs[i]);
+ }
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+ return ret;
+}
+
+int
+__afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies, void *gfid,
+ unsigned char *locked_on)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ loc_t loc = {0, };
+ call_frame_t *new_frame = NULL;
+ afr_local_t *new_local = NULL;
+ int fav_child = -1;
+ unsigned char *fav_gfid;
+ char *policy_str;
+
+ priv = this->private;
+
+ new_frame = afr_frame_create (this);
+ if (!new_frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ new_local = new_frame->local;
+
+ gf_uuid_copy (parent->gfid, pargfid);
+
+ loc.parent = inode_ref (parent);
+ loc.inode = inode_ref (inode);
+ gf_uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+
+ /*
+ * Ok, go find our favorite child by one of the active policies:
+ * majority -> ctime -> mtime -> size -> predefined
+ * we'll use this gfid as the "real" one.
+ */
+ fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+ &policy_str);
+ if (fav_child == -1) { /* No policies are in place, bail */
+ gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID "
+ "split brain, there are no favorite child policies "
+ "set.");
+ ret = -EIO;
+ goto out;
+ }
+ fav_gfid = replies[fav_child].poststat.ia_gfid;
+ gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid "
+ "split-brain. GFID is %s.", fav_child, uuid_utoa (fav_gfid));
+
+ gf_uuid_copy (loc.gfid, fav_gfid);
+ ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies,
+ inode, &loc);
+
+ if (ret)
+ goto out;
+
+ xdata = dict_new ();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_static_bin (xdata, "gfid-req", fav_gfid, 16);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Clear out old replies here and wind lookup on all locked
+ * subvolumes to achieve two things:
+ * a. gfid heal on those subvolumes that do not have gfid associated
+ * with the inode, and
+ * b. refresh replies, which can be consumed by
+ * __afr_selfheal_name_impunge().
+ */
+ afr_replies_wipe (replies, priv->child_count);
+ /* This sends out lookups to all bricks and blocks once we have
+ * them.
+ */
+ AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup,
+ &loc, xdata);
+ afr_replies_copy (replies, new_local->replies, priv->child_count);
+out:
+ loc_wipe (&loc);
+ if (xdata)
+ dict_unref (xdata);
+ if (new_frame)
+ AFR_STACK_DESTROY (new_frame);
+
+ return ret;
+}
+
int
__afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
@@ -28,6 +310,7 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,
loc_t loc = {0, };
call_frame_t *new_frame = NULL;
afr_local_t *new_local = NULL;
+ int i;
priv = this->private;
@@ -83,6 +366,25 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,
* __afr_selfheal_name_impunge().
*/
+ gf_log (this->name, GF_LOG_INFO,
+ "smashing gfid to %s", uuid_utoa(gfid));
+
+ ia_type_t ia_type = replies[0].poststat.ia_type;
+ for (i = 1; i < priv->child_count; ++i) {
+ if (replies[i].poststat.ia_type != ia_type) {
+ if (replies[i].poststat.ia_type == IA_INVAL) {
+ continue;
+ }
+ gf_log (this->name, GF_LOG_WARNING,
+ "type[%d] = %d (not %d)", i,
+ replies[i].poststat.ia_type, ia_type);
+ if (ia_type != IA_INVAL) {
+ ret = -EIO;
+ goto out;
+ }
+ ia_type = replies[i].poststat.ia_type;
+ }
+ }
AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup,
&loc, xdata);
@@ -266,52 +568,6 @@ afr_selfheal_name_need_heal_check (xlator_t *this, struct afr_reply *replies)
return need_heal;
}
-static int
-afr_selfheal_name_type_mismatch_check (xlator_t *this, struct afr_reply *replies,
- int source, unsigned char *sources,
- uuid_t pargfid, const char *bname)
-{
- int i = 0;
- int type_idx = -1;
- ia_type_t inode_type = IA_INVAL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if (replies[i].poststat.ia_type == IA_INVAL)
- continue;
-
- if (inode_type == IA_INVAL) {
- inode_type = replies[i].poststat.ia_type;
- type_idx = i;
- continue;
- }
-
- if (sources[i] || source == -1) {
- if ((sources[type_idx] || source == -1) &&
- (inode_type != replies[i].poststat.ia_type)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN,
- "Type mismatch for <gfid:%s>/%s: "
- "%d on %s and %d on %s",
- uuid_utoa(pargfid), bname,
- replies[i].poststat.ia_type,
- priv->children[i]->name,
- replies[type_idx].poststat.ia_type,
- priv->children[type_idx]->name);
-
- return -EIO;
- }
- inode_type = replies[i].poststat.ia_type;
- type_idx = i;
- }
- }
- return 0;
-}
static int
afr_selfheal_name_gfid_mismatch_check (xlator_t *this, struct afr_reply *replies,
@@ -408,7 +664,10 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
gf_boolean_t source_is_empty = _gf_true;
gf_boolean_t need_heal = _gf_false;
gf_boolean_t is_gfid_absent = _gf_false;
+ gf_boolean_t tried_gfid_unsplit = _gf_false;
+ afr_private_t *priv = NULL;
+ priv = this->private;
need_heal = afr_selfheal_name_need_heal_check (this, replies);
if (!need_heal)
return 0;
@@ -424,18 +683,16 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
return ret;
}
- ret = afr_selfheal_name_type_mismatch_check (this, replies, source,
- sources, pargfid, bname);
- if (ret)
- return ret;
-
+gfid_mismatch_check:
ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source,
sources, &gfid_idx,
pargfid, bname);
- if (ret)
+
+ if (ret && tried_gfid_unsplit) {
return ret;
+ }
- if (gfid_idx == -1) {
+ if (gfid_idx == -1) {
if (!gfid_req || gf_uuid_is_null (gfid_req))
return -1;
gfid = gfid_req;
@@ -443,12 +700,24 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
gfid = &replies[gfid_idx].poststat.ia_gfid;
}
+ if (priv->gfid_splitbrain_forced_heal || ret) {
+ ret = __afr_selfheal_gfid_unsplit (this, parent, pargfid,
+ bname, inode, replies, gfid, locked_on);
+
+ if (ret)
+ return ret;
+
+ tried_gfid_unsplit = _gf_true;
+ goto gfid_mismatch_check;
+ }
+
is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
- ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode,
- replies, gfid, locked_on,
- is_gfid_absent);
- if (ret)
+ ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname,
+ inode, replies, gfid,
+ locked_on, is_gfid_absent);
+ if (ret) {
return ret;
+ }
if (gfid_idx == -1) {
gfid_idx = afr_selfheal_gfid_idx_get (this, replies, sources);
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index c6ac5ebfd1b..4ac1d32f58a 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -58,6 +58,7 @@ typedef struct {
eh_t **statistics;
uint32_t max_threads;
uint32_t wait_qlength;
+ uint32_t halo_max_latency_msec;
} afr_self_heald_t;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index af81b77ddb6..86f667116af 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -143,6 +143,10 @@ reconfigure (xlator_t *this, dict_t *options)
priv->metadata_splitbrain_forced_heal, options, bool,
out);
+ GF_OPTION_RECONF ("gfid-splitbrain-forced-heal",
+ priv->gfid_splitbrain_forced_heal, options, bool,
+ out);
+
GF_OPTION_RECONF ("background-self-heal-count",
priv->background_self_heal_count, options, uint32,
out);
@@ -160,6 +164,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,
bool, out);
+ GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal,
+ options, bool, out);
+
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -176,6 +183,42 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("data-self-heal-algorithm",
priv->data_self_heal_algorithm, options, str, out);
+ GF_OPTION_RECONF ("halo-enabled",
+ priv->halo_enabled, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-failover-enabled",
+ priv->halo_failover_enabled, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-shd-max-latency",
+ priv->shd.halo_max_latency_msec, options, uint32,
+ out);
+
+ GF_OPTION_RECONF ("halo-nfsd-max-latency",
+ priv->nfsd.halo_max_latency_msec, options, uint32,
+ out);
+
+ GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
+ options, uint32, out);
+
+ GF_OPTION_RECONF ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, options, bool,
+ out);
+
+ GF_OPTION_RECONF ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options,
+ uint32, out);
+
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -256,6 +299,7 @@ reconfigure (xlator_t *this, dict_t *options)
if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
goto out;
+ priv->did_local_discovery = _gf_false;
priv->did_discovery = _gf_false;
ret = 0;
@@ -327,6 +371,9 @@ init (xlator_t *this)
GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
priv->metadata_splitbrain_forced_heal, bool, out);
+ GF_OPTION_INIT ("gfid-splitbrain-forced-heal",
+ priv->gfid_splitbrain_forced_heal, bool, out);
+
GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);
if (read_subvol) {
priv->read_child = xlator_subvolume_index (this, read_subvol);
@@ -377,6 +424,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-wait-qlength", priv->shd.wait_qlength,
uint32, out);
+ GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out);
+
GF_OPTION_INIT ("background-self-heal-count",
priv->background_self_heal_count, uint32, out);
@@ -396,6 +445,35 @@ init (xlator_t *this)
GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
+ GF_OPTION_INIT ("halo-hybrid-mode",
+ priv->halo_hybrid_mode, bool, out);
+
+ GF_OPTION_INIT ("halo-hybrid-read-max-latency",
+ priv->halo_hybrid_read_max_latency_msec, uint32,
+ out);
+
+ GF_OPTION_INIT ("halo-enabled",
+ priv->halo_enabled, bool, out);
+
+ GF_OPTION_INIT ("halo-failover-enabled",
+ priv->halo_failover_enabled, bool, out);
+
+ GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ uint32, out);
+ GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec,
+ uint32, out);
+ GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32,
+ out);
+ GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32,
+ out);
+ GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32,
+ out);
+
+ GF_OPTION_INIT ("halo-nfsd-max-latency",
+ priv->nfsd.halo_max_latency_msec, uint32, out);
+
+ GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+
GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
@@ -445,17 +523,24 @@ init (xlator_t *this)
priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
gf_afr_mt_char);
- if (!priv->child_up) {
+
+ priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency),
+ child_count,
+ gf_afr_mt_child_latency_t);
+
+ if (!priv->child_up || !priv->child_latency) {
ret = -ENOMEM;
goto out;
}
- for (i = 0; i < child_count; i++)
+ for (i = 0; i < child_count; i++) {
+ priv->child_latency[i] = 0.0;
priv->child_up[i] = -1; /* start with unknown state.
this initialization needed
for afr_notify() to work
reliably
*/
+ }
priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
gf_afr_mt_xlator_t);
@@ -663,6 +748,85 @@ struct volume_options options[] = {
"jobs that can perform parallel heals in the "
"background."
},
+ { .key = {"halo-shd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .description = "Maximum latency for shd halo replication in msec."
+ },
+ { .key = {"halo-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .description = "Enable Halo (geo) replication mode."
+ },
+ { .key = {"halo-failover-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .description = "Enable x-halo failover: will allow failover "
+ "to bricks outside the client or daemons' halo "
+ "in an attempt to satisfy halo-min-replicas."
+ },
+ { .key = {"halo-nfsd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .description = "Maximum latency for nfsd halo replication in msec."
+ },
+ { .key = {"halo-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .description = "Maximum latency for halo replication in msec."
+ },
+ { .key = {"halo-hybrid-mode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enable hybrid sync mounts. When enabled, halo will "
+ "do write FOPs synchronously, and read FOPs will be "
+ "services in-region if the inode is clean/consistent."
+ "If no bricks can be found below "
+ "halo-hybrid-max-read-latency then the best 2 shall "
+ "be selected. This option can be used in "
+ "conjunction with all other halo options."
+ },
+ { .key = {"halo-hybrid-read-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "8",
+ .description = "Maximum latency hybrid mode will use to select "
+ "children for read FOPs. Don't tune this unless "
+ "you really know what you are doing (i.e. you've "
+ "read/understand the associated source code)."
+ },
+ { .key = {"halo-max-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .description = "The maximum number of halo replicas; replicas"
+ " beyond this value will be written asynchronously"
+ "via the SHD."
+ },
+ { .key = {"halo-min-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "2",
+ .description = "The minimum number of halo replicas, before adding "
+ "out of region replicas."
+ },
+ { .key = {"halo-min-samples"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "3",
+ .description = "The minimum number of halo latency samples, before "
+ "we start forming the halos."
+ },
{ .key = {"heal-wait-queue-length"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
@@ -803,6 +967,13 @@ struct volume_options options[] = {
"translator is running as part of self-heal-daemon "
"or not."
},
+ { .key = {"iam-nfs-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of an NFS daemon "
+ "or not."
+ },
{ .key = {"quorum-type"},
.type = GF_OPTION_TYPE_STR,
.value = { "none", "auto", "fixed"},
@@ -865,9 +1036,13 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
+ { .key = {"gfid-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {"heal-timeout"},
.type = GF_OPTION_TYPE_INT,
- .min = 60,
+ .min = 5,
.max = INT_MAX,
.default_value = "600",
.description = "time interval for checking the need to self-heal "
@@ -933,5 +1108,9 @@ struct volume_options options[] = {
" with identical mtime and size in more than half the "
"number of bricks in the replica.",
},
+ { .key = {"pgfid-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
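
To make the interaction of the halo tunables above concrete, here is a minimal, self-contained sketch of one way halo membership could be derived from per-child latency. It is not the AFR implementation; the names (form_halo, struct child) and the qsort-based fallback are invented for illustration only. Children at or below halo-max-latency join the halo, and the lowest-latency children are pulled in until halo-min-replicas is satisfied, capped at halo-max-replicas.

#include <stdio.h>
#include <stdlib.h>

struct child {
        int  idx;
        long latency_msec;   /* measured ping latency */
        int  in_halo;        /* output: selected for synchronous replication */
};

static int
cmp_latency (const void *a, const void *b)
{
        const struct child *ca = a, *cb = b;
        return (ca->latency_msec > cb->latency_msec) -
               (ca->latency_msec < cb->latency_msec);
}

static void
form_halo (struct child *kids, int n, long max_latency, int min_replicas,
           int max_replicas)
{
        int selected = 0, i;

        /* Sort by latency so the fallback picks the best children first. */
        qsort (kids, n, sizeof (*kids), cmp_latency);

        for (i = 0; i < n; i++) {
                if (selected < max_replicas &&
                    (kids[i].latency_msec <= max_latency ||
                     selected < min_replicas)) {
                        kids[i].in_halo = 1;
                        selected++;
                } else {
                        kids[i].in_halo = 0;
                }
        }
}

int
main (void)
{
        struct child kids[] = {
                { 0, 3, 0 }, { 1, 40, 0 }, { 2, 7, 0 }, { 3, 120, 0 },
        };
        int i;

        form_halo (kids, 4, 5 /* halo-max-latency */,
                   2 /* halo-min-replicas */, 99999 /* halo-max-replicas */);

        for (i = 0; i < 4; i++)
                printf ("child %d latency %ldms in_halo=%d\n",
                        kids[i].idx, kids[i].latency_msec, kids[i].in_halo);
        return 0;
}

With the sample latencies, the 3 msec child qualifies outright and the 7 msec child is pulled in to reach halo-min-replicas even though it exceeds halo-max-latency, which mirrors the "min replicas before out-of-region replicas" behavior described above.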
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 70c3e349743..b61f6f67460 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,6 +30,9 @@
#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* Latency for down children */
+#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Examine bricks <= 10 msec */
+#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */
#define AFR_LOCKEE_COUNT_MAX 3
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
@@ -48,6 +51,8 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
+#define AFR_ROOT_GFID "00000000-0000-0000-0000-000000000001"
+#define AFR_IS_ROOT_GFID(g) (strcmp (uuid_utoa(g), AFR_ROOT_GFID) == 0)
#define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;})
#define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
@@ -72,6 +77,17 @@ typedef enum {
AFR_FAV_CHILD_POLICY_MAX,
} afr_favorite_child_policy;
+struct afr_nfsd {
+ gf_boolean_t iamnfsd;
+ uint32_t halo_max_latency_msec;
+};
+
+struct afr_child {
+ uint32_t idx;
+ int64_t latency;
+ unsigned char child_up;
+};
+
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
@@ -83,6 +99,8 @@ typedef struct _afr_private {
inode_t *root_inode;
unsigned char *child_up;
+ int64_t *child_latency;
+ gf_boolean_t pgfid_self_heal;
unsigned char *local;
char **pending_key;
@@ -111,6 +129,7 @@ typedef struct _afr_private {
gf_boolean_t entry_change_log; /* on/off */
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ gf_boolean_t gfid_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
int favorite_child; /* subvolume to be preferred in resolving
@@ -148,13 +167,25 @@ typedef struct _afr_private {
uint32_t event_generation;
gf_boolean_t choose_local;
+ gf_boolean_t did_local_discovery;
gf_boolean_t did_discovery;
uint64_t sh_readdir_size;
gf_boolean_t ensure_durability;
char *sh_domain;
char *afr_dirty;
+ gf_boolean_t halo_enabled;
+
+ /* Halo geo-replication tunables */
+ gf_boolean_t halo_failover_enabled;
+ gf_boolean_t halo_hybrid_mode;
+ uint32_t halo_hybrid_read_max_latency_msec;
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
+ uint32_t halo_min_samples;
- afr_self_heald_t shd;
+ afr_self_heald_t shd;
+ struct afr_nfsd nfsd;
gf_boolean_t consistent_metadata;
uint64_t spb_choice_timeout;
@@ -787,6 +818,7 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
gf_boolean_t do_discovery;
+ gf_boolean_t do_local_discovery;
struct afr_reply *replies;
/* For client side background heals. */
@@ -795,6 +827,9 @@ typedef struct _afr_local {
gf_boolean_t need_full_crawl;
gf_boolean_t is_read_txn;
+ loc_t *unsplit_locs; /* Un-split targets */
+ uuid_t heal_pgfid; /* pgfid of file being healed */
+ char *heal_ancestry_path; /* Full path if avail */
} afr_local_t;
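
The constants AFR_CHILD_DOWN_LATENCY, AFR_HALO_HYBRID_CHILD_LIMIT and AFR_HALO_HYBRID_LATENCY_MSEC added above suggest how hybrid mode picks read children. The following self-contained sketch is an assumption-laden illustration of that selection, not the patch's code: down children carry the sentinel latency and never qualify, and if nothing is under the hybrid latency budget the best two live children are taken instead.

#include <stdint.h>
#include <stdio.h>

#define CHILD_DOWN_LATENCY   INT64_MAX   /* stands in for AFR_CHILD_DOWN_LATENCY */
#define HYBRID_CHILD_LIMIT   2           /* AFR_HALO_HYBRID_CHILD_LIMIT */
#define HYBRID_LATENCY_MSEC  10          /* AFR_HALO_HYBRID_LATENCY_MSEC */

static int
pick_read_children (const int64_t *latency, int n, unsigned char *readable)
{
        int i, picked = 0;

        /* First pass: every live child inside the hybrid latency budget. */
        for (i = 0; i < n; i++) {
                readable[i] = (latency[i] != CHILD_DOWN_LATENCY &&
                               latency[i] <= HYBRID_LATENCY_MSEC);
                picked += readable[i];
        }
        if (picked)
                return picked;

        /* Fallback: the HYBRID_CHILD_LIMIT lowest-latency live children. */
        while (picked < HYBRID_CHILD_LIMIT) {
                int best = -1;
                for (i = 0; i < n; i++)
                        if (!readable[i] && latency[i] != CHILD_DOWN_LATENCY &&
                            (best < 0 || latency[i] < latency[best]))
                                best = i;
                if (best < 0)
                        break;
                readable[best] = 1;
                picked++;
        }
        return picked;
}

int
main (void)
{
        int64_t latency[] = { 35, CHILD_DOWN_LATENCY, 22, 60 };
        unsigned char readable[4] = { 0 };
        int n = pick_read_children (latency, 4, readable);

        printf ("picked %d children: %d %d %d %d\n", n,
                readable[0], readable[1], readable[2], readable[3]);
        return 0;
}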
diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/cluster/aha/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am
new file mode 100644
index 00000000000..006db127d28
--- /dev/null
+++ b/xlators/cluster/aha/src/Makefile.am
@@ -0,0 +1,18 @@
+
+xlator_LTLIBRARIES = aha.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+aha_la_LDFLAGS = -module -avoid-version
+
+aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c
+aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = aha-mem-types.h aha.h aha-helpers.h aha-retry.h aha-fops.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c
new file mode 100644
index 00000000000..3b2ca641de2
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-fops.c
@@ -0,0 +1,952 @@
+#include "aha-fops.h"
+
+static void
+__save_fop (struct aha_fop *fop, struct aha_conf *conf)
+{
+ list_add_tail (&fop->list, &conf->failed);
+}
+
+void
+save_fop (struct aha_fop *fop, struct aha_conf *conf)
+{
+ LOCK (&conf->lock);
+ {
+ __save_fop (fop, conf);
+ }
+ UNLOCK (&conf->lock);
+}
+
+#define AHA_HANDLE_FOP(frame, type, cbk, obj, fn, args ...) \
+ do { \
+ struct aha_fop *fop = aha_fop_new (); \
+ if (!fop) { \
+ gf_log (GF_AHA, GF_LOG_CRITICAL, \
+ "Allocation failed, terminating " \
+ "to prevent a hung mount."); \
+ assert (0); \
+ } \
+ fop->stub = fop_##type##_stub (frame, aha_##type, \
+ args); \
+ fop->frame = frame; \
+ frame->local = fop; \
+ STACK_WIND (frame, cbk, obj, fn, args); \
+ } while (0) \
+
+/*
+ * AHA_HANDLE_FOP_CBK
+ *
+ * 1) If the error returned is ENOTCONN *and* the timer that waits
+ * for the server to come back has not expired, store the fop to retry later.
+ * 2) If the timer waiting for the server has expired, just unwind.
+ * 3) If the error returned is something other than ENOTCONN, just unwind.
+ *
+ */
+#define AHA_HANDLE_FOP_CBK(type, frame, args ...) \
+ do { \
+ struct aha_conf *conf = frame->this->private; \
+ struct aha_fop *fop = frame->local; \
+ if (op_ret != 0 && op_errno == ENOTCONN && \
+ !aha_is_timer_expired (conf)) { \
+ gf_log (GF_AHA, GF_LOG_WARNING, \
+ "Got ENOTCONN from client, storing " \
+ "to retry later!"); \
+ save_fop (fop, conf); \
+ } else { \
+ AHA_DESTROY_LOCAL (frame); \
+ STACK_UNWIND_STRICT (type, frame, args); \
+ } \
+ } while (0) \
+
+int
+aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode,
+ buf, xdata, postparent);
+ return 0;
+}
+
+
+int
+aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup,
+ loc, xdata);
+ return 0;
+}
+
+
+int
+aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, stat, aha_stat_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->stat,
+ loc, xdata);
+ return 0;
+}
+
+
+int
+aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop,
+ postop, xdata);
+ return 0;
+}
+
+
+int
+aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+}
+
+
+int
+aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop,
+ postop, xdata);
+ return 0;
+}
+
+int
+aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+}
+
+
+int
+aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+int
+aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+
+int
+aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+int
+aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+
+int
+aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t mask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, access, aha_access_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->access,
+ loc, mask, xdata);
+ return 0;
+}
+
+
+int
+aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *path, struct iatt *sbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno,
+ path, sbuf, xdata);
+ return 0;
+}
+
+
+int
+aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ size_t size, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readlink,
+ loc, size, xdata);
+ return 0;
+}
+
+
+int
+aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno,
+ inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->mknod,
+ loc, mode, rdev, umask, xdata);
+ return 0;
+}
+
+
+int
+aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno,
+ inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int
+aha_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
+
+
+int
+aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+}
+
+
+int
+aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
+}
+
+
+int
+aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf,
+ preoldparent, postoldparent,
+ prenewparent, postnewparent, xdata);
+ return 0;
+}
+
+
+int
+aha_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, rename, aha_rename_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+
+int
+aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, link, aha_link_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+
+int
+aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+
+int
+aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, create, aha_create_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+}
+
+
+int
+aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+
+int
+aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, open, aha_open_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+}
+
+int
+aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno,
+ vector, count, stbuf, iobref, xdata);
+ return 0;
+}
+
+int
+aha_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readv, aha_readv_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+int
+aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count,
+ off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, writev, aha_writev_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+ fd, vector, count, off, flags, iobref, xdata);
+ return 0;
+}
+
+
+int
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, flush, aha_flush_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->flush,
+ fd, xdata);
+ return 0;
+}
+
+
+int
+aha_fsync_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ return 0;
+}
+
+
+int
+aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync,
+ fd, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+
+int
+aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fstat,
+ fd, xdata);
+ return 0;
+}
+
+
+int
+aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+
+int
+aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, opendir, aha_opendir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->opendir,
+ loc, fd, xdata);
+ return 0;
+}
+
+int
+aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsyncdir,
+ fd, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+
+int
+aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->statfs,
+ loc, xdata);
+ return 0;
+}
+
+
+
+int
+aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+}
+
+int
+aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+}
+
+
+int
+aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fgetxattr,
+ fd, name, xdata);
+ return 0;
+}
+
+
+int
+aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->xattrop,
+ loc, flags, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fxattrop,
+ fd, flags, dict, xdata);
+ return 0;
+}
+
+
+int
+aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+int
+aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+
+int
+aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata);
+ return 0;
+}
+
+
+int
+aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, lk, aha_lk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lk,
+ fd, cmd, lock, xdata);
+ return 0;
+}
+
+
+int
+aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_inodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->inodelk,
+ volume, loc, cmd, lock, xdata);
+ return 0;
+}
+
+
+int
+aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_finodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->finodelk,
+ volume, fd, cmd, lock, xdata);
+ return 0;
+}
+
+
+int
+aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_entrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->entrylk,
+ volume, loc, basename, cmd, type, xdata);
+ return 0;
+}
+
+
+int
+aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+
+int
+aha_fentrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fentrylk,
+ volume, fd, basename, cmd, type, xdata);
+ return 0;
+}
+
+int
+aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+
+int
+aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t off, dict_t *xdata)
+{
+ AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readdir,
+ fd, size, off, xdata);
+ return 0;
+}
+
+
+int
+aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+
+int
+aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict)
+{
+ AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readdirp,
+ fd, size, off, dict);
+ return 0;
+}
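
All of the callbacks above funnel through AHA_HANDLE_FOP_CBK, so the whole file reduces to one decision: answer the caller now, or park the fop for a later retry. The sketch below models just that decision in plain C. It is illustrative only; classify_reply and the enum are invented names, and the real code operates on call frames and stubs rather than plain integers.

#include <errno.h>
#include <stdio.h>

enum disposition { ANSWER_NOW, QUEUE_FOR_RETRY };

static enum disposition
classify_reply (int op_ret, int op_errno, int timer_expired)
{
        /* Queue only on ENOTCONN while the child-down grace timer is live. */
        if (op_ret != 0 && op_errno == ENOTCONN && !timer_expired)
                return QUEUE_FOR_RETRY;
        return ANSWER_NOW;
}

int
main (void)
{
        struct { int ret, err, expired; } cases[] = {
                { 0, 0, 0 },          /* success              -> answer */
                { -1, ENOTCONN, 0 },  /* disconnected, fresh  -> queue  */
                { -1, ENOTCONN, 1 },  /* disconnected, late   -> answer */
                { -1, EIO, 0 },       /* any other error      -> answer */
        };
        int i;

        for (i = 0; i < 4; i++)
                printf ("case %d: %s\n", i,
                        classify_reply (cases[i].ret, cases[i].err,
                                        cases[i].expired) == QUEUE_FOR_RETRY ?
                        "queue for retry" : "answer now");
        return 0;
}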
diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h
new file mode 100644
index 00000000000..b1fb9d38a80
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-fops.h
@@ -0,0 +1,360 @@
+#ifndef _AHA_FOPS_H
+#define _AHA_FOPS_H
+
+#include "aha.h"
+#include "aha-helpers.h"
+
+/* FOP functions */
+int
+aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+int
+aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+int
+aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata);
+
+int
+aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata);
+
+int
+aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
+
+int
+aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
+
+int
+aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata);
+
+int
+aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
+
+int
+aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
+
+int
+aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
+
+int
+aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata);
+
+int
+aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int
+aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int
+aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+int
+aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+int
+aha_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags,
+ dict_t *xdata);
+
+int
+aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t off, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+
+int
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int
+aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata);
+
+int
+aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int
+aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
+
+int
+aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata);
+
+int
+aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int
+aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+
+int
+aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+
+int
+aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int
+aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int
+aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+
+int
+aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int
+aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata);
+
+int
+aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, int32_t cmd, struct gf_flock *lock,
+ dict_t *xdata);
+
+int
+aha_finodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, int32_t cmd,
+ struct gf_flock *lock, dict_t *xdata);
+
+int
+aha_entrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+
+int
+aha_fentrylk (call_frame_t *frame, xlator_t *this,
+ const char *volume, fd_t *fd, const char *basename,
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+int
+aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata);
+
+int
+aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict);
+
+/* Callback functions */
+
+int
+aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent);
+
+int
+aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata);
+
+int
+aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata);
+
+int
+aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata);
+
+int
+aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+
+int
+aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+
+
+int
+aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+
+int
+aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ const char *path, struct iatt *sbuf, dict_t *xdata);
+
+
+int
+aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+
+int
+aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+int
+aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+
+int
+aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+int
+aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+int
+aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata);
+
+int
+aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+int
+aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata);
+int
+aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata);
+int
+aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref, dict_t *xdata);
+
+int
+aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata);
+int
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_fsync_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+int
+aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata);
+
+int
+aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata);
+int
+aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata);
+int
+aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int
+aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata);
+
+int
+aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
+int
+aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
+
+#endif /* _AHA_FOPS_H */
diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c
new file mode 100644
index 00000000000..e3b713688d3
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.c
@@ -0,0 +1,46 @@
+#include "aha-helpers.h"
+
+struct aha_conf *aha_conf_new ()
+{
+ struct aha_conf *conf = NULL;
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_aha_mt_conf);
+ if (!conf)
+ goto err;
+
+ INIT_LIST_HEAD (&conf->failed);
+
+ LOCK_INIT (&conf->lock);
+err:
+ return conf;
+}
+
+void aha_conf_destroy (struct aha_conf *conf)
+{
+ LOCK_DESTROY (&conf->lock);
+ GF_FREE (conf);
+}
+
+struct aha_fop *aha_fop_new ()
+{
+ struct aha_fop *fop = NULL;
+
+ fop = GF_CALLOC (1, sizeof (*fop), gf_aha_mt_fop);
+ if (!fop)
+ goto err;
+
+ INIT_LIST_HEAD (&fop->list);
+
+err:
+ return fop;
+}
+
+void aha_fop_destroy (struct aha_fop *fop)
+{
+ if (!fop)
+ return;
+
+ call_stub_destroy (fop->stub);
+ fop->stub = NULL;
+ GF_FREE (fop);
+}
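
The helpers above follow the convention used throughout this translator: a double-underscore function (e.g. __save_fop, __aha_retry_failed_fops) assumes the queue lock is already held, and a public wrapper takes the lock around it. Here is a tiny, self-contained model of that convention; the pthread mutex and hand-rolled singly linked list merely stand in for gf_lock_t and list_head and are not the patch's code.

#include <pthread.h>
#include <stdio.h>

struct fop {
        int         id;
        struct fop *next;
};

struct queue {
        pthread_mutex_t lock;
        struct fop     *head;
};

/* Caller must already hold q->lock. */
static void
__queue_push (struct queue *q, struct fop *f)
{
        f->next = q->head;
        q->head = f;
}

/* Public wrapper: acquires the lock, delegates to the __ helper. */
static void
queue_push (struct queue *q, struct fop *f)
{
        pthread_mutex_lock (&q->lock);
        __queue_push (q, f);
        pthread_mutex_unlock (&q->lock);
}

int
main (void)
{
        struct queue q = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct fop a = { 1, NULL }, b = { 2, NULL };
        struct fop *it;

        queue_push (&q, &a);
        queue_push (&q, &b);

        for (it = q.head; it; it = it->next)
                printf ("queued fop %d\n", it->id);
        return 0;
}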
diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h
new file mode 100644
index 00000000000..d9cf9b3295d
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.h
@@ -0,0 +1,23 @@
+#ifndef _AHA_HELPERS_H
+#define _AHA_HELPERS_H
+
+#include "aha.h"
+
+#define GF_AHA "aha"
+
+struct aha_conf *aha_conf_new ();
+
+void aha_conf_destroy (struct aha_conf *conf);
+
+struct aha_fop *aha_fop_new ();
+
+void aha_fop_destroy (struct aha_fop *fop);
+
+#define AHA_DESTROY_LOCAL(frame) \
+ do { \
+ struct aha_fop *fop = frame->local; \
+ aha_fop_destroy (fop); \
+ frame->local = NULL; \
+ } while (0) \
+
+#endif /* _AHA_HELPERS_H */
diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h
new file mode 100644
index 00000000000..117dda27e8b
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-mem-types.h
@@ -0,0 +1,22 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __AHA_MEM_TYPES_H__
+#define __AHA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_aha_mem_types_ {
+ gf_aha_mt_begin_t = gf_common_mt_end + 1,
+ gf_aha_mt_conf,
+ gf_aha_mt_fop,
+ gf_aha_mt_end
+};
+#endif
diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c
new file mode 100644
index 00000000000..8810f913f42
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-retry.c
@@ -0,0 +1,524 @@
+#include "aha.h"
+#include "aha-helpers.h"
+#include "aha-retry.h"
+#include "aha-fops.h"
+
+/*
+ * AHA_RETRY_FOP:
+ *
+ * - We STACK_WIND the fop using the arguments in the call_stub.
+ * We use STACK_WIND because we need a *new* frame, since we already
+ * exhausted the existing frame with the original STACK_WIND.
+ *
+ * - After STACK_WIND completes, we can destroy this frame's local (which
+ * should be struct aha_fop *). The frame itself will get destroyed higher in
+ * the xlator graph, since it's still part of the call stack.
+ */
+#define AHA_RETRY_FOP(fop, type, args ...) \
+ do { \
+ call_stub_t *stub = fop->stub; \
+ call_frame_t *frame = fop->frame; \
+ xlator_t *this = frame->this; \
+ STACK_WIND (frame, aha_##type##_cbk, this, \
+ this->fops->type, args); \
+ AHA_DESTROY_LOCAL (frame); \
+ } while (0) \
+
+#define AHA_UNWIND_FOP(fop, type) \
+ do { \
+ call_frame_t *frame = fop->frame; \
+ AHA_DESTROY_LOCAL (frame); \
+ default_##type##_failure_cbk (frame, ETIMEDOUT); \
+ } while (0) \
+
+void
+__aha_retry_force_unwind_fops (struct aha_conf *conf)
+{
+ struct aha_fop *fop = NULL;
+ struct aha_fop *tmp = NULL;
+ size_t ndrained = 0;
+
+ /*
+ * Drain the queue. After we finish the loop, the list
+ * must be empty.
+ */
+ list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+ list_del (&fop->list);
+ aha_force_unwind_fop (fop);
+ ndrained++;
+ }
+
+ gf_log (GF_AHA, GF_LOG_WARNING,
+ "Force-unwound %"GF_PRI_SIZET" fops!", ndrained);
+
+ assert (list_empty (&conf->failed));
+}
+
+void
+aha_force_unwind_fops (struct aha_conf *conf)
+{
+ LOCK (&conf->lock);
+ {
+ __aha_retry_force_unwind_fops (conf);
+ }
+ UNLOCK (&conf->lock);
+}
+
+void
+__aha_retry_failed_fops (struct aha_conf *conf)
+{
+ struct aha_fop *fop = NULL;
+ struct aha_fop *tmp = NULL;
+ size_t ndrained = 0;
+
+ /*
+ * Skip if the child is not up
+ */
+ if (!conf->child_up) {
+ gf_log (GF_AHA, GF_LOG_WARNING,
+ "Waiting for child to come up before retrying.");
+ return;
+ }
+
+ /*
+ * Note if the queue is empty; the drain loop below is then a no-op.
+ */
+ if (list_empty (&conf->failed)) {
+ gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry.");
+ }
+
+ /*
+ * Drain the queue. After we finish the loop, the list
+ * must be empty.
+ */
+ list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+ list_del (&fop->list);
+ aha_retry_fop (fop);
+ ndrained++;
+ }
+
+ gf_log (GF_AHA, GF_LOG_WARNING,
+ "Drained %"GF_PRI_SIZET" fops!", ndrained);
+
+ assert (list_empty (&conf->failed));
+}
+
+
+void
+aha_retry_failed_fops (struct aha_conf *conf)
+{
+ LOCK (&conf->lock);
+ {
+ __aha_retry_failed_fops (conf);
+ }
+ UNLOCK (&conf->lock);
+}
+
+void aha_retry_fop (struct aha_fop *fop)
+{
+ call_stub_t *stub = fop->stub;
+
+ switch (stub->fop) {
+ case GF_FOP_OPEN:
+ AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags,
+ stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_CREATE:
+ AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags,
+ stub->args.mode, stub->args.umask,
+ stub->args.fd,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_STAT:
+ AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_READLINK:
+ AHA_RETRY_FOP (fop, readlink, &stub->args.loc,
+ stub->args.size, stub->args.xdata);
+ break;
+
+ case GF_FOP_MKNOD:
+ AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode,
+ stub->args.rdev, stub->args.umask,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_MKDIR:
+ AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode,
+ stub->args.umask, stub->args.xdata);
+ break;
+
+ case GF_FOP_UNLINK:
+ AHA_RETRY_FOP (fop, unlink, &stub->args.loc, stub->args.xflag,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_RMDIR:
+ AHA_RETRY_FOP (fop, rmdir, &stub->args.loc,
+ stub->args.flags, stub->args.xdata);
+ break;
+
+ case GF_FOP_SYMLINK:
+ AHA_RETRY_FOP (fop, symlink, stub->args.linkname,
+ &stub->args.loc, stub->args.umask,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_RENAME:
+ AHA_RETRY_FOP (fop, rename, &stub->args.loc,
+ &stub->args.loc2, stub->args.xdata);
+ break;
+
+ case GF_FOP_LINK:
+ AHA_RETRY_FOP (fop, link, &stub->args.loc,
+ &stub->args.loc2, stub->args.xdata);
+ break;
+
+ case GF_FOP_TRUNCATE:
+ AHA_RETRY_FOP (fop, truncate, &stub->args.loc,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_READ:
+ AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.flags,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_WRITE:
+ AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector,
+ stub->args.count, stub->args.offset,
+ stub->args.flags, stub->args.iobref,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_STATFS:
+ AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_FLUSH:
+ AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSYNC:
+ AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_SETXATTR:
+ AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr,
+ stub->args.flags, stub->args.xdata);
+ break;
+
+ case GF_FOP_GETXATTR:
+ AHA_RETRY_FOP (fop, getxattr, &stub->args.loc,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd,
+ stub->args.xattr, stub->args.flags,
+ stub->args.xdata);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_REMOVEXATTR:
+ AHA_RETRY_FOP (fop, removexattr, &stub->args.loc,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd,
+ stub->args.name, stub->args.xdata);
+ break;
+
+ case GF_FOP_OPENDIR:
+ AHA_RETRY_FOP (fop, opendir, &stub->args.loc,
+ stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSYNCDIR:
+ AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd,
+ stub->args.datasync, stub->args.xdata);
+ break;
+
+ case GF_FOP_ACCESS:
+ AHA_RETRY_FOP (fop, access, &stub->args.loc,
+ stub->args.mask, stub->args.xdata);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ AHA_RETRY_FOP (fop, ftruncate, stub->args.fd,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSTAT:
+ AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata);
+ break;
+
+ case GF_FOP_LK:
+ AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_INODELK:
+ AHA_RETRY_FOP (fop, inodelk, stub->args.volume,
+ &stub->args.loc, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_FINODELK:
+ AHA_RETRY_FOP (fop, finodelk, stub->args.volume,
+ stub->args.fd, stub->args.cmd,
+ &stub->args.lock, stub->args.xdata);
+ break;
+
+ case GF_FOP_ENTRYLK:
+ AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc,
+ stub->args.name, stub->args.entrylkcmd,
+ stub->args.entrylktype, stub->args.xdata);
+ break;
+
+ case GF_FOP_FENTRYLK:
+ AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd,
+ stub->args.name, stub->args.entrylkcmd,
+ stub->args.entrylktype, stub->args.xdata);
+ break;
+
+ case GF_FOP_LOOKUP:
+ AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata);
+ break;
+
+ case GF_FOP_READDIR:
+ AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_READDIRP:
+ AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size,
+ stub->args.offset, stub->args.xdata);
+ break;
+
+ case GF_FOP_XATTROP:
+ AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype,
+ stub->args.xattr, stub->args.xdata);
+ break;
+
+ case GF_FOP_FXATTROP:
+ AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype,
+ stub->args.xattr, stub->args.xdata);
+ break;
+
+ case GF_FOP_SETATTR:
+ AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat,
+ stub->args.valid, stub->args.xdata);
+ break;
+
+ case GF_FOP_FSETATTR:
+ AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat,
+ stub->args.valid, stub->args.xdata);
+ break;
+
+ default:
+ /* Some fops are not implemented yet:
+ *
+ * GF_FOP_NULL
+ * GF_FOP_RCHECKSUM
+ * GF_FOP_FORGET
+ * GF_FOP_RELEASE
+ * GF_FOP_RELEASEDIR
+ * GF_FOP_GETSPEC
+ * GF_FOP_FALLOCATE
+ * GF_FOP_DISCARD
+ * GF_FOP_ZEROFILL
+ * GF_FOP_MAXVALUE
+ *
+ */
+ gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s",
+ gf_fop_list[stub->fop]);
+ assert (0);
+ break;
+ }
+}
+
+void
+aha_force_unwind_fop (struct aha_fop *fop)
+{
+ call_stub_t *stub = fop->stub;
+
+ switch (stub->fop) {
+ case GF_FOP_OPEN:
+ AHA_UNWIND_FOP (fop, open);
+ break;
+
+ case GF_FOP_CREATE:
+ AHA_UNWIND_FOP (fop, create);
+ break;
+
+ case GF_FOP_STAT:
+ AHA_UNWIND_FOP (fop, stat);
+ break;
+
+ case GF_FOP_READLINK:
+ AHA_UNWIND_FOP (fop, readlink);
+ break;
+
+ case GF_FOP_MKNOD:
+ AHA_UNWIND_FOP (fop, mknod);
+ break;
+
+ case GF_FOP_MKDIR:
+ AHA_UNWIND_FOP (fop, mkdir);
+ break;
+
+ case GF_FOP_UNLINK:
+ AHA_UNWIND_FOP (fop, unlink);
+ break;
+
+ case GF_FOP_RMDIR:
+ AHA_UNWIND_FOP (fop, rmdir);
+ break;
+
+ case GF_FOP_SYMLINK:
+ AHA_UNWIND_FOP (fop, symlink);
+ break;
+
+ case GF_FOP_RENAME:
+ AHA_UNWIND_FOP (fop, rename);
+ break;
+
+ case GF_FOP_LINK:
+ AHA_UNWIND_FOP (fop, link);
+ break;
+
+ case GF_FOP_TRUNCATE:
+ AHA_UNWIND_FOP (fop, truncate);
+ break;
+
+ case GF_FOP_READ:
+ AHA_UNWIND_FOP (fop, readv);
+ break;
+
+ case GF_FOP_WRITE:
+ AHA_UNWIND_FOP (fop, writev);
+ break;
+
+ case GF_FOP_STATFS:
+ AHA_UNWIND_FOP (fop, statfs);
+ break;
+
+ case GF_FOP_FLUSH:
+ AHA_UNWIND_FOP (fop, flush);
+ break;
+
+ case GF_FOP_FSYNC:
+ AHA_UNWIND_FOP (fop, fsync);
+ break;
+
+ case GF_FOP_SETXATTR:
+ AHA_UNWIND_FOP (fop, setxattr);
+ break;
+
+ case GF_FOP_GETXATTR:
+ AHA_UNWIND_FOP (fop, getxattr);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ AHA_UNWIND_FOP (fop, fsetxattr);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ AHA_UNWIND_FOP (fop, fgetxattr);
+ break;
+
+ case GF_FOP_REMOVEXATTR:
+ AHA_UNWIND_FOP (fop, removexattr);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ AHA_UNWIND_FOP (fop, fremovexattr);
+ break;
+
+ case GF_FOP_OPENDIR:
+ AHA_UNWIND_FOP (fop, opendir);
+ break;
+
+ case GF_FOP_FSYNCDIR:
+ AHA_UNWIND_FOP (fop, fsyncdir);
+ break;
+
+ case GF_FOP_ACCESS:
+ AHA_UNWIND_FOP (fop, access);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ AHA_UNWIND_FOP (fop, ftruncate);
+ break;
+
+ case GF_FOP_FSTAT:
+ AHA_UNWIND_FOP (fop, fstat);
+ break;
+
+ case GF_FOP_LK:
+ AHA_UNWIND_FOP (fop, lk);
+ break;
+
+ case GF_FOP_INODELK:
+ AHA_UNWIND_FOP (fop, inodelk);
+ break;
+
+ case GF_FOP_FINODELK:
+ AHA_UNWIND_FOP (fop, finodelk);
+ break;
+
+ case GF_FOP_ENTRYLK:
+ AHA_UNWIND_FOP (fop, entrylk);
+ break;
+
+ case GF_FOP_FENTRYLK:
+ AHA_UNWIND_FOP (fop, fentrylk);
+ break;
+
+ case GF_FOP_LOOKUP:
+ AHA_UNWIND_FOP (fop, lookup);
+ break;
+
+ case GF_FOP_READDIR:
+ AHA_UNWIND_FOP (fop, readdir);
+ break;
+
+ case GF_FOP_READDIRP:
+ AHA_UNWIND_FOP (fop, readdirp);
+ break;
+
+ case GF_FOP_XATTROP:
+ AHA_UNWIND_FOP (fop, xattrop);
+ break;
+
+ case GF_FOP_FXATTROP:
+ AHA_UNWIND_FOP (fop, fxattrop);
+ break;
+
+ case GF_FOP_SETATTR:
+ AHA_UNWIND_FOP (fop, setattr);
+ break;
+
+ case GF_FOP_FSETATTR:
+ AHA_UNWIND_FOP (fop, fsetattr);
+ break;
+
+ default:
+ /* Some fops are not implemented yet; this should never
+ * happen because we would not have queued them (see the
+ * assert in aha_retry_fop()).
+ */
+ break;
+ }
+}
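
Both drain paths above unlink every queued entry and hand it to a per-entry action (retry or force-unwind), then assert the queue is empty. The sketch below captures that pattern in isolation; it uses a plain singly linked list and an invented drain() helper instead of list_for_each_entry_safe, so treat it as an illustration of the shape of the loop rather than the real code.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct fop {
        int         id;
        struct fop *next;
};

static void
retry_one (struct fop *f)
{
        printf ("retrying fop %d\n", f->id);
        free (f);
}

static size_t
drain (struct fop **head, void (*action) (struct fop *))
{
        size_t ndrained = 0;

        while (*head) {
                struct fop *f = *head;

                *head = f->next;        /* unlink before acting on it */
                action (f);
                ndrained++;
        }
        assert (*head == NULL);         /* queue must be empty afterwards */
        return ndrained;
}

int
main (void)
{
        struct fop *head = NULL;
        int i;

        for (i = 0; i < 3; i++) {
                struct fop *f = malloc (sizeof (*f));
                f->id = i;
                f->next = head;
                head = f;
        }
        printf ("drained %zu fops\n", drain (&head, retry_one));
        return 0;
}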
diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h
new file mode 100644
index 00000000000..5c8f56bca97
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-retry.h
@@ -0,0 +1,12 @@
+#ifndef _AHA_RETRY_H
+#define _AHA_RETRY_H
+
+void aha_retry_failed_fops (struct aha_conf *conf);
+
+void aha_retry_fop (struct aha_fop *fop);
+
+void aha_force_unwind_fops (struct aha_conf *conf);
+
+void aha_force_unwind_fop (struct aha_fop *fop);
+
+#endif /* _AHA_RETRY_H */
diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c
new file mode 100644
index 00000000000..5160f1091d4
--- /dev/null
+++ b/xlators/cluster/aha/src/aha.c
@@ -0,0 +1,345 @@
+#include "aha-helpers.h"
+#include "aha-retry.h"
+#include "aha-fops.h"
+#include "aha.h"
+
+#include "syncop.h"
+
+
+int
+retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg)
+{
+ /* Nothing to do here ... */
+ return 0;
+}
+
+int
+retry_failed_fops (void *arg)
+{
+ xlator_t *this = NULL;
+
+ struct aha_conf *conf = NULL;
+
+ this = arg;
+ conf = this->private;
+
+ aha_retry_failed_fops (conf);
+
+ return 0;
+}
+
+void
+dispatch_fop_queue_drain (xlator_t *this)
+{
+ struct syncenv *env = NULL;
+ int ret = 0;
+
+ env = this->ctx->env;
+
+ ret = synctask_new (env, retry_failed_fops,
+ retry_failed_fops_cbk, NULL, this);
+ if (ret != 0) {
+ gf_log (GF_AHA, GF_LOG_CRITICAL,
+ "Failed to dispatch synctask "
+ "to drain fop queue!");
+ }
+}
+
+static inline void
+__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired)
+{
+ conf->timer_expired = expired;
+}
+
+static inline gf_boolean_t
+__aha_is_timer_expired (struct aha_conf *conf)
+{
+ return conf->timer_expired;
+}
+
+gf_boolean_t
+aha_is_timer_expired (struct aha_conf *conf)
+{
+ gf_boolean_t expired = _gf_false;
+
+ LOCK (&conf->lock);
+ {
+ expired = __aha_is_timer_expired (conf);
+ }
+ UNLOCK (&conf->lock);
+
+ return expired;
+}
+
+void
+aha_child_down_timer_expired (void *data)
+{
+ struct aha_conf *conf = NULL;
+
+ conf = data;
+
+ gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!");
+
+ LOCK (&conf->lock);
+ {
+ __aha_set_timer_status (conf, _gf_true);
+ }
+ UNLOCK (&conf->lock);
+
+ aha_force_unwind_fops ((struct aha_conf *)data);
+}
+
+void
+__aha_start_timer (struct aha_conf *conf)
+{
+ struct timespec child_down_timeout = {
+ .tv_sec = conf->server_wait_timeout,
+ .tv_nsec = 0
+ };
+
+ __aha_set_timer_status (conf, _gf_false);
+
+ conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout,
+ aha_child_down_timer_expired, conf);
+ if (!conf->timer) {
+ gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!");
+ }
+
+ gf_log (GF_AHA, GF_LOG_INFO,
+ "Registered timer for %lu seconds.",
+ conf->server_wait_timeout);
+}
+
+void
+__aha_cancel_timer (struct aha_conf *conf)
+{
+ if (!conf->timer)
+ goto out;
+
+ gf_timer_call_cancel (conf->this->ctx, conf->timer);
+ conf->timer = NULL;
+ gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!");
+out:
+ return;
+}
+
+void
+__aha_update_child_status (struct aha_conf *conf, int status)
+{
+ conf->child_up = status;
+}
+
+void
+aha_handle_child_up (xlator_t *this)
+{
+ struct aha_conf *conf = this->private;
+
+ LOCK (&conf->lock);
+ {
+ __aha_update_child_status (
+ conf, AHA_CHILD_STATUS_UP); /* Mark the child as up */
+ __aha_set_timer_status (
+ conf, _gf_false); /* Timer is no longer expired */
+ __aha_cancel_timer (conf); /* Cancel the timer */
+ }
+ UNLOCK (&conf->lock);
+}
+
+void
+aha_handle_child_down (xlator_t *this)
+{
+ struct aha_conf *conf = this->private;
+
+ LOCK (&conf->lock);
+ {
+ __aha_update_child_status (conf, AHA_CHILD_STATUS_DOWN);
+ __aha_set_timer_status (conf, _gf_true);
+ __aha_start_timer (conf);
+ }
+ UNLOCK (&conf->lock);
+}
+
+int32_t
+notify (xlator_t *this, int32_t event, void *data, ...)
+{
+ switch (event) {
+ case GF_EVENT_CHILD_DOWN:
+ gf_log (this->name, GF_LOG_WARNING, "Got child-down event!");
+ aha_handle_child_down (this);
+ break;
+ case GF_EVENT_CHILD_UP:
+ gf_log (this->name, GF_LOG_WARNING, "Got child-up event!");
+ aha_handle_child_up (this);
+ dispatch_fop_queue_drain (this);
+ break;
+ default:
+ break;
+ }
+
+ default_notify (this, event, data);
+
+ return 0;
+}
+
+int32_t
+aha_priv_dump (xlator_t *this)
+{
+ return 0;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_aha_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Memory accounting init failed!");
+ return ret;
+ }
+
+ return ret;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ struct aha_conf *conf = NULL;
+
+ conf = this->private;
+
+ GF_OPTION_RECONF ("server-wait-timeout-seconds",
+ conf->server_wait_timeout,
+ options, size_uint64, err);
+
+ return 0;
+err:
+ return -1;
+}
+
+int
+aha_init_options (xlator_t *this)
+{
+ struct aha_conf *conf = NULL;
+
+ conf = this->private;
+
+ GF_OPTION_INIT ("server-wait-timeout-seconds",
+ conf->server_wait_timeout,
+ size_uint64, err);
+
+ return 0;
+err:
+ return -1;
+}
+
+
+int
+init (xlator_t *this)
+{
+ int ret = 0;
+ struct aha_conf *conf = NULL;
+
+ conf = aha_conf_new ();
+ if (!conf) {
+ ret = -(ENOMEM);
+ goto err;
+ }
+
+ conf->this = this;
+ this->private = conf;
+
+ ret = aha_init_options (this);
+ if (ret != 0)
+ goto err;
+
+ /* init() completed successfully */
+ goto done;
+err:
+ gf_log (GF_AHA, GF_LOG_ERROR,
+ "init() failed, please see "
+ "logs for details.");
+
+ /* Free all allocated memory */
+ aha_conf_destroy (conf);
+done:
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ struct aha_conf *conf = this->private;
+
+ aha_conf_destroy (conf);
+
+ this->private = NULL;
+}
+
+struct xlator_dumpops dumpops = {
+ .priv = aha_priv_dump,
+};
+
+struct xlator_cbks cbks;
+
+struct xlator_fops fops = {
+ .lookup = aha_lookup,
+ .stat = aha_stat,
+ .readlink = aha_readlink,
+ .mknod = aha_mknod,
+ .mkdir = aha_mkdir,
+ .unlink = aha_unlink,
+ .rmdir = aha_rmdir,
+ .symlink = aha_symlink,
+ .rename = aha_rename,
+ .link = aha_link,
+ .truncate = aha_truncate,
+ .create = aha_create,
+ .open = aha_open,
+ .readv = aha_readv,
+ .writev = aha_writev,
+ .statfs = aha_statfs,
+ .flush = aha_flush,
+ .fsync = aha_fsync,
+ .setxattr = aha_setxattr,
+ .getxattr = aha_getxattr,
+ .removexattr = aha_removexattr,
+ .fsetxattr = aha_fsetxattr,
+ .fgetxattr = aha_fgetxattr,
+ .fremovexattr = aha_fremovexattr,
+ .opendir = aha_opendir,
+ .readdir = aha_readdir,
+ .readdirp = aha_readdirp,
+ .fsyncdir = aha_fsyncdir,
+ .access = aha_access,
+ .ftruncate = aha_ftruncate,
+ .fstat = aha_fstat,
+ .lk = aha_lk,
+ .lookup_cbk = aha_lookup_cbk,
+ .xattrop = aha_xattrop,
+ .fxattrop = aha_fxattrop,
+ .inodelk = aha_inodelk,
+ .finodelk = aha_finodelk,
+ .entrylk = aha_entrylk,
+ .fentrylk = aha_fentrylk,
+ .setattr = aha_setattr,
+ .fsetattr = aha_fsetattr,
+};
+
+struct volume_options options[] = {
+ { .key = {"server-wait-timeout-seconds"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 10,
+ .max = 20 * 60,
+ .default_value = TOSTRING (120),
+ .description = "Specifies the number of seconds the "
+ "AHA translator will wait "
+ "for a CHILD_UP event before "
+ "force-unwinding the frames it has "
+ "currently stored for retry."
+ },
+ { .key = {NULL} }
+};
diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h
new file mode 100644
index 00000000000..3dbf3199776
--- /dev/null
+++ b/xlators/cluster/aha/src/aha.h
@@ -0,0 +1,46 @@
+#ifndef _AHA_H
+#define _AHA_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "statedump.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "list.h"
+#include "timer.h"
+
+#include "aha-mem-types.h"
+
+/* new() and destroy() functions for all structs can be found in
+ * aha-helpers.c
+ */
+struct aha_conf {
+ xlator_t *this;
+ uint8_t child_up;
+ gf_lock_t lock;
+ struct list_head failed;
+ gf_timer_t *timer;
+ gf_boolean_t timer_expired;
+ uint64_t server_wait_timeout;
+};
+
+struct aha_fop {
+ call_stub_t *stub; /* Only used to store function arguments */
+ call_frame_t *frame; /* Frame corresponding to this fop */
+ uint64_t tries;
+ struct list_head list;
+};
+
+enum {
+ AHA_CHILD_STATUS_DOWN = 0,
+ AHA_CHILD_STATUS_UP = 1,
+ AHA_CHILD_STATUS_MAX
+};
+
+gf_boolean_t aha_is_timer_expired (struct aha_conf *conf);
+
+#endif
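For context, the `failed` list in struct aha_conf holds one struct aha_fop per stored frame; the drain path (aha_retry_failed_fops in aha-retry.c, not shown in this diff) walks it under the lock and either re-winds each entry on CHILD_UP or force-unwinds it after the timer expires. A toy model of that queue-then-drain pattern follows; the stand-in names (fop_entry, queue_failed, drain) are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fop_entry {
        int               id;     /* stands in for the stored call frame */
        struct fop_entry *next;
};

struct fop_queue {
        pthread_mutex_t   lock;
        struct fop_entry *head, *tail;
};

static void queue_failed (struct fop_queue *q, int id)
{
        struct fop_entry *e = calloc (1, sizeof (*e));

        if (!e)
                return;
        e->id = id;

        pthread_mutex_lock (&q->lock);
        if (q->tail)
                q->tail->next = e;
        else
                q->head = e;
        q->tail = e;
        pthread_mutex_unlock (&q->lock);
}

/* Detach the whole list under the lock, then retry (child back up)
 * or fail (timer expired) each entry outside the lock. */
static void drain (struct fop_queue *q, int retry)
{
        struct fop_entry *e, *next;

        pthread_mutex_lock (&q->lock);
        e = q->head;
        q->head = q->tail = NULL;
        pthread_mutex_unlock (&q->lock);

        for (; e; e = next) {
                next = e->next;
                printf ("%s fop %d\n", retry ? "retrying" : "unwinding", e->id);
                free (e);
        }
}

int main (void)
{
        struct fop_queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };

        queue_failed (&q, 1);
        queue_failed (&q, 2);
        drain (&q, 1);   /* child came back: retry both queued fops */
        return 0;
}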
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index c4586c2f9b1..cd35080e243 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -3463,6 +3463,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
int cnt = 0;
char *node_uuid_key = NULL;
int ret = -1;
+
+ GF_CHECK_XATTR_KEY_AND_GOTO (key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
@@ -5553,6 +5555,7 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
{
dht_local_t *local = NULL;
xlator_t *avail_subvol = NULL;
+ int op_errno = 0;
local = frame->local;
@@ -5565,9 +5568,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
subvol, subvol->fops->mknod, loc, mode,
rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
-
- if (avail_subvol != subvol) {
+ /* This will return NULL if all subvolumes are full
+ * and/or no subvolume meets the min_free_disk limit
+ */
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->rdev = rdev;
local->mode = mode;
@@ -5597,6 +5606,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
}
out:
return 0;
+err:
+ return op_errno;
}
int32_t
@@ -6165,7 +6176,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
gf_msg_debug (this->name, 0,
"no subvolume in layout for path=%s",
loc->path);
- op_errno = EIO;
+ op_errno = NO_SUBVOL_HASH_ERRNO;
goto err;
}
@@ -6236,8 +6247,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
}
}
- dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode,
- umask, params);
+ op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc,
+ rdev, mode, umask,
+ params);
+ if (op_errno != 0) {
+ goto err;
+ }
done:
return 0;
@@ -6571,7 +6586,7 @@ dht_link (call_frame_t *frame, xlator_t *this,
gf_msg_debug (this->name, 0,
"no subvolume in layout for path=%s",
newloc->path);
- op_errno = EIO;
+ op_errno = NO_SUBVOL_HASH_ERRNO;
goto err;
}
@@ -6734,6 +6749,7 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
{
dht_local_t *local = NULL;
xlator_t *avail_subvol = NULL;
+ int op_errno = 0;
local = frame->local;
@@ -6748,8 +6764,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
} else {
avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
-
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
local->mode = mode;
@@ -6776,6 +6794,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
}
out:
return 0;
+err:
+ DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return op_errno;
}
int
@@ -6878,9 +6900,10 @@ dht_create_do (call_frame_t *frame)
goto err;
}
- dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc,
- local->flags, local->mode,
- local->umask, local->fd, local->params);
+ dht_create_wind_to_avail_subvol (frame, this, subvol,
+ &local->loc, local->flags,
+ local->mode, local->umask,
+ local->fd, local->params);
return 0;
err:
local->refresh_layout_unlock (frame, this, -1, 1);
@@ -7067,7 +7090,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
"no subvolume in layout for path=%s",
loc->path);
- op_errno = EIO;
+ op_errno = NO_SUBVOL_HASH_ERRNO;
goto err;
}
@@ -7590,7 +7613,7 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,
gf_msg_debug (this->name, 0,
"hashed subvol not found for %s",
loc->path);
- local->op_errno = EIO;
+ local->op_errno = NO_SUBVOL_HASH_ERRNO;
goto err;
}
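The change running through dht_mknod, dht_create, nufa and switch above is that dht_free_disk_available_subvol may now return NULL, and the fop fails with ENOSPC instead of quietly landing on the full hashed subvolume. A condensed sketch of that selection rule, with stand-in types (struct subvol, pick_avail_subvol) that are not part of DHT:

#include <stddef.h>
#include <stdio.h>
#include <errno.h>

struct subvol {
        const char        *name;
        unsigned long long avail_bytes;
        int                is_full;   /* below min-free-disk / min-free-inodes */
};

/* Pick the subvolume with the most free space that is not marked full;
 * return NULL when every candidate is full, mirroring the new behaviour
 * of failing the create/mknod with ENOSPC. */
static struct subvol *pick_avail_subvol (struct subvol *s, size_t n)
{
        struct subvol *best = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (s[i].is_full)
                        continue;
                if (!best || s[i].avail_bytes > best->avail_bytes)
                        best = &s[i];
        }
        return best;
}

int main (void)
{
        struct subvol bricks[] = {
                { "subvol-0", 10ULL << 20, 1 },
                { "subvol-1", 50ULL << 20, 1 },
        };
        struct subvol *t = pick_avail_subvol (bricks, 2);

        if (!t) {
                fprintf (stderr, "create would fail with ENOSPC (%d)\n", ENOSPC);
                return 1;
        }
        printf ("would wind to %s\n", t->name);
        return 0;
}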
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 9e9ca712417..fa973f294fb 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -300,6 +300,7 @@ struct dht_du {
uint64_t avail_space;
uint32_t log;
uint32_t chunks;
+ gf_boolean_t is_full;
};
typedef struct dht_du dht_du_t;
@@ -484,6 +485,7 @@ struct dht_conf {
dht_du_t *du_stats;
double min_free_disk;
double min_free_inodes;
+ gf_boolean_t min_free_strict_mode;
char disk_unit;
int32_t refresh_interval;
gf_boolean_t unhashed_sticky_bit;
@@ -549,6 +551,10 @@ struct dht_conf {
gf_boolean_t lock_migration_enabled;
gf_lock_t lock;
+
+ /* du stats */
+ uint32_t du_refresh_interval_sec;
+ gf_lock_t du_refresh_lock;
};
typedef struct dht_conf dht_conf_t;
@@ -603,6 +609,8 @@ typedef struct dht_fd_ctx {
} dht_fd_ctx_t;
+#define NO_SUBVOL_HASH_ERRNO EROFS
+
#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0)
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 1eb9e63c531..1b20dabc61f 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
call_frame_t *statfs_frame = NULL;
dht_local_t *statfs_local = NULL;
struct timeval tv = {0,};
+ struct timeval cmp_tv = {0,};
loc_t tmp_loc = {0,};
conf = this->private;
+ /* Somebody else is already refreshing the statfs info */
+ if (TRY_LOCK (&conf->du_refresh_lock) != 0)
+ return 0;
+
gettimeofday (&tv, NULL);
+ cmp_tv = conf->last_stat_fetch;
+ cmp_tv.tv_sec += conf->du_refresh_interval_sec;
+
/* make it root gfid, should be enough to get the proper
info back */
tmp_loc.gfid[15] = 1;
- if (tv.tv_sec > (conf->refresh_interval
- + conf->last_stat_fetch.tv_sec)) {
-
+ if (timercmp (&tv, &cmp_tv, >)) {
statfs_frame = copy_frame (frame);
if (!statfs_frame) {
goto err;
@@ -200,14 +206,18 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
&tmp_loc, statfs_local->params);
}
- conf->last_stat_fetch.tv_sec = tv.tv_sec;
+ conf->last_stat_fetch = tv;
}
- return 0;
+ ret = 0;
+ goto out;
err:
if (statfs_frame)
DHT_STACK_DESTROY (statfs_frame);
- return -1;
+ ret = -1;
+out:
+ UNLOCK (&conf->du_refresh_lock);
+ return ret;
}
@@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
conf = this->private;
/* Check for values above specified percent or free disk */
- LOCK (&conf->subvolume_lock);
- {
+ if (TRY_LOCK (&conf->subvolume_lock) != 0) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ return conf->du_stats[i].is_full;
+ }
+ }
+ } else {
for (i = 0; i < conf->subvolume_cnt; i++) {
if (subvol == conf->subvolumes[i]) {
if (conf->disk_unit == 'p') {
@@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
}
}
}
- }
+
+ /* i will be less than subvolume_cnt if either of
+ * these booleans is true */
+ is_subvol_filled = (subvol_filled_space ||
+ subvol_filled_inodes);
+ if (is_subvol_filled) {
+ conf->du_stats[i].is_full = is_subvol_filled;
+ }
+ }
UNLOCK (&conf->subvolume_lock);
if (subvol_filled_space && conf->subvolume_status[i]) {
@@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
}
}
- is_subvol_filled = (subvol_filled_space || subvol_filled_inodes);
-
return is_subvol_filled;
}
@@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol,
layout);
- if(!avail_subvol)
- {
- avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol,
- layout);
- }
-
}
UNLOCK (&conf->subvolume_lock);
out:
@@ -325,7 +339,6 @@ out:
gf_msg_debug (this->name, 0,
"No subvolume has enough free space \
and/or inodes to create");
- avail_subvol = subvol;
}
if (layout)
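The refresh path above now lets exactly one caller re-fetch statfs per interval: whoever wins the trylock compares the cached timestamp against the refresh interval, everyone else returns immediately instead of blocking the fop. A rough standalone equivalent of that throttle, where maybe_refresh_du, last_fetch and refresh_interval_sec are stand-ins rather than DHT symbols:

#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>

static pthread_mutex_t refresh_lock = PTHREAD_MUTEX_INITIALIZER;
static struct timeval  last_fetch;              /* zero => never fetched */
static unsigned        refresh_interval_sec = 60;

/* Returns 1 when this caller actually refreshed the stats. */
static int maybe_refresh_du (void)
{
        struct timeval now, due;
        int refreshed = 0;

        /* Somebody else is already refreshing: skip, do not block the fop. */
        if (pthread_mutex_trylock (&refresh_lock) != 0)
                return 0;

        gettimeofday (&now, NULL);
        due = last_fetch;
        due.tv_sec += refresh_interval_sec;

        if (timercmp (&now, &due, >)) {
                /* ...issue statfs on each subvolume here... */
                last_fetch = now;
                refreshed = 1;
        }

        pthread_mutex_unlock (&refresh_lock);
        return refreshed;
}

int main (void)
{
        printf ("first call refreshed: %d\n", maybe_refresh_du ());
        printf ("second call refreshed: %d\n", maybe_refresh_du ());
        return 0;
}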
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index 298eca711b4..05f71fbcc86 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
int op_errno = -1;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ conf = this->private;
+
+ if (conf->min_free_strict_mode == _gf_true)
+ dht_get_du_info (frame, this, loc);
local = dht_local_init (frame, loc, fd, GF_FOP_OPEN);
if (!local) {
@@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this,
"no cached subvolume for fd=%p", fd);
op_errno = EINVAL;
goto err;
+ } else if (conf->min_free_strict_mode == _gf_true &&
+ dht_is_subvol_filled (this, subvol) == _gf_true &&
+ flags & O_APPEND) {
+ op_errno = ENOSPC;
+ goto err;
}
if (xdata)
local->xattr_req = dict_ref (xdata);
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index 364b66c942e..48d49dd3475 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
xlator_t *subvol = NULL;
int op_errno = -1;
dht_local_t *local = NULL;
+ loc_t nil_loc = {0,};
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ conf = this->private;
+
+
local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE);
if (!local) {
@@ -173,12 +178,19 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto err;
}
+ if (conf->min_free_strict_mode == _gf_true)
+ dht_get_du_info (frame, this, &nil_loc);
+
subvol = local->cached_subvol;
if (!subvol) {
gf_msg_debug (this->name, 0,
"no cached subvolume for fd=%p", fd);
op_errno = EINVAL;
goto err;
+ } else if (conf->min_free_strict_mode == _gf_true &&
+ dht_is_subvol_filled (this, subvol) == _gf_true) {
+ op_errno = ENOSPC;
+ goto err;
}
if (xdata)
local->xattr_req = dict_ref (xdata);
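With min-free-strict-mode on, the checks added to dht_open and dht_writev short-circuit before winding: an O_APPEND open, or any write, aimed at a subvolume already marked full is failed with ENOSPC up front. Below is a condensed sketch of that decision; gate_write_fop and struct gate_conf are stand-in names, not DHT API.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

struct gate_conf {
        int strict_mode;     /* the min-free-strict-mode option */
};

/* Returns 0 to let the fop through, otherwise an errno to fail it with. */
static int gate_write_fop (const struct gate_conf *conf, int subvol_is_full,
                           int open_flags, int is_write)
{
        if (!conf->strict_mode || !subvol_is_full)
                return 0;

        /* Writes are always rejected; opens only when they can grow
         * the file (O_APPEND). */
        if (is_write || (open_flags & O_APPEND))
                return ENOSPC;

        return 0;
}

int main (void)
{
        struct gate_conf conf = { .strict_mode = 1 };

        printf ("append open on full subvol -> errno %d\n",
                gate_write_fop (&conf, 1, O_APPEND | O_WRONLY, 0));
        printf ("read-only open on full subvol -> errno %d\n",
                gate_write_fop (&conf, 1, O_RDONLY, 0));
        printf ("write on full subvol -> errno %d\n",
                gate_write_fop (&conf, 1, 0, 1));
        return 0;
}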
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index a478f06b2a9..dc0b7dd619e 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -20,7 +20,7 @@
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
-#define DHT_REBALANCE_BLKSIZE (128 * 1024)
+#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
#define MAX_MIGRATE_QUEUE_COUNT 500
#define MIN_MIGRATE_QUEUE_COUNT 200
@@ -1328,14 +1328,25 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
&dst_fd, xattr);
- if (ret)
- goto out;
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: "
+ "failed to create dest file on %s",
+ loc->path, to->name);
+ goto out;
+ }
clean_dst = _gf_true;
ret = __dht_check_free_space (to, from, loc, &stbuf, flag);
if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: "
+ "Disk space check failed on %s",
+ loc->path, to->name);
goto out;
}
@@ -1345,7 +1356,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: failed to open %s on %s",
+ "Migrate file failed: %s: failed to open on %s",
loc->path, from->name);
goto out;
}
@@ -1360,7 +1371,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, -ret,
DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:failed to lookup %s on %s ",
+ "Migrate file failed: %s: failed to lookup %s ",
loc->path, from->name);
ret = -1;
goto out;
@@ -2427,6 +2438,9 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
goto out;
}
+ gf_uuid_copy (entry_loc.inode->gfid,
+ df_entry->d_stat.ia_gfid);
+
if (gf_uuid_is_null (df_entry->d_stat.ia_gfid)) {
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_GFID_NULL,
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 5c810f0dc77..ccbf66b626d 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -443,6 +443,8 @@ dht_reconfigure (xlator_t *this, dict_t *options)
conf->disk_unit = 0;
if (conf->min_free_disk < 100.0)
conf->disk_unit = 'p';
+ GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode,
+ options, bool, out);
GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
percent, out);
@@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options,
bool, out);
+
+ GF_OPTION_RECONF ("du-refresh-interval-sec",
+ conf->du_refresh_interval_sec, options, uint32, out);
ret = 0;
out:
return ret;
@@ -720,7 +725,10 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
- err);
+ err);
+
+ GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode,
+ bool, err);
GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
err);
@@ -738,6 +746,11 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled,
bool, err);
+ GF_OPTION_INIT ("du-refresh-interval-sec",
+ conf->du_refresh_interval_sec, uint32, err);
+
+ LOCK_INIT (&conf->du_refresh_lock);
+
if (defrag) {
defrag->lock_migration_enabled = conf->lock_migration_enabled;
@@ -907,6 +920,14 @@ struct volume_options options[] = {
"process starts balancing out the cluster, and logs will appear "
"in log files",
},
+ { .key = {"min-free-strict-mode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "When enabled, will reject in-flight writes or "
+ "append operations to files when the target subvolume falls "
+ "below min-free-(disk|inodes). When disabled, these are allowed "
+ "through and only new files will be affected.",
+ },
{ .key = {"min-free-inodes"},
.type = GF_OPTION_TYPE_PERCENT,
.default_value = "5%",
@@ -1089,5 +1110,14 @@ struct volume_options options[] = {
" associated with a file during rebalance"
},
+ { .key = {"du-refresh-interval-sec"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "60",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Specifies how many seconds before subvolume statfs "
+ "info is re-validated."
+ },
+
{ .key = {NULL} },
};
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 56e17d6e884..996faffa37f 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this,
local);
}
- if (subvol != avail_subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (subvol != avail_subvol) {
/* create a link file instead of actual file */
local->params = dict_ref (params);
local->mode = mode;
@@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
local);
}
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
/* Create linkfile first */
local->params = dict_ref (params);
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index f1e9a399442..8b14ac99b8f 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this,
local);
}
- if (subvol != avail_subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (subvol != avail_subvol) {
/* create a link file instead of actual file */
local->mode = mode;
local->flags = flags;
@@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local);
}
- if (avail_subvol != subvol) {
+ if (!avail_subvol) {
+ op_errno = ENOSPC;
+ goto err;
+ } else if (avail_subvol != subvol) {
/* Create linkfile first */
local->params = dict_ref (params);
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index c21417a0192..0b5c095c3b4 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -35,6 +35,7 @@
#include "logging.h"
#include "cli1-xdr.h"
#include "statedump.h"
+#include "syncop.h"
#include <pwd.h>
#include <grp.h>
@@ -91,9 +92,13 @@ typedef struct _ios_sample_t {
uid_t uid;
gid_t gid;
char identifier[UNIX_PATH_MAX];
+ char path[UNIX_PATH_MAX];
glusterfs_fop_t fop_type;
struct timeval timestamp;
double elapsed;
+ gf_boolean_t have_path;
+ int32_t op_ret;
+ int32_t op_errno;
} ios_sample_t;
@@ -178,10 +183,33 @@ typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*,
int , int , uint64_t ) ;
struct ios_local {
- struct timeval wind_at;
- struct timeval unwind_at;
+ inode_t *inode;
+ loc_t loc;
+ fd_t *fd;
};
+static struct ios_local *
+ios_local_new() {
+ return GF_CALLOC (1, sizeof (struct ios_local),
+ gf_common_mt_char);
+}
+
+static void
+ios_local_free (struct ios_local *local)
+{
+ if (!local)
+ return;
+
+ inode_unref (local->inode);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ loc_wipe (&local->loc);
+ memset (local, 0, sizeof (*local));
+ GF_FREE (local);
+}
+
struct volume_options options[];
static int
@@ -192,6 +220,57 @@ is_fop_latency_started (call_frame_t *frame)
return memcmp (&frame->begin, &epoch, sizeof (epoch));
}
+static void
+ios_free_local (call_frame_t *frame)
+{
+ struct ios_local *local = frame->local;
+
+ ios_local_free (local);
+
+ frame->local = NULL;
+}
+
+static void
+ios_track_loc (call_frame_t *frame, loc_t *loc)
+{
+ struct ios_local *local = NULL;
+
+ if (loc && loc->path) {
+ /* Check if frame->local is already set (it should
+ * only be set by either ios_track_loc() or
+ * ios_track_fd()). In other words, this check
+ * allows us to chain calls to ios_track_loc()
+ * and ios_track_fd() without clobbering frame->local
+ * in the process.
+ */
+ if (frame->local) {
+ local = frame->local;
+ } else {
+ local = ios_local_new ();
+ }
+ loc_copy (&local->loc, loc);
+ frame->local = local;
+ }
+}
+
+static void
+ios_track_fd (call_frame_t *frame, fd_t *fd)
+{
+ struct ios_local *local = NULL;
+
+ if (fd && fd->inode) {
+ if (frame->local) {
+ local = frame->local;
+ } else {
+ local = ios_local_new ();
+ }
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+ frame->local = local;
+ }
+}
+
+
#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"
#ifdef GF_LINUX_HOST_OS
#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats"
@@ -206,7 +285,7 @@ is_fop_latency_started (call_frame_t *frame)
conf = this->private; \
if (conf && conf->measure_latency) { \
gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op); \
+ update_ios_latency (conf, frame, GF_FOP_##op, 0, 0); \
} \
} while (0)
@@ -244,7 +323,7 @@ is_fop_latency_started (call_frame_t *frame)
#define STATS_ADD(x,i) (x) += (i)
#endif
-#define UPDATE_PROFILE_STATS(frame, op) \
+#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \
do { \
struct ios_conf *conf = NULL; \
\
@@ -257,7 +336,8 @@ is_fop_latency_started (call_frame_t *frame)
conf->count_fop_hits) { \
BUMP_FOP(op); \
gettimeofday (&frame->end, NULL); \
- update_ios_latency (conf, frame, GF_FOP_##op);\
+ update_ios_latency (conf, frame, GF_FOP_##op, \
+ op_ret, op_errno); \
} \
} \
STATS_UNLOCK (&conf->lock); \
@@ -647,7 +727,7 @@ ios_stats_cleanup (xlator_t *this, inode_t *inode)
fprintf (logfp, fmt); \
fprintf (logfp, "\n"); \
} \
- gf_log (this->name, GF_LOG_DEBUG, fmt); \
+ gf_log (this->name, GF_LOG_TRACE, fmt); \
} while (0)
int
@@ -694,7 +774,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this,
int
_io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
- char *key_root = "gluster";
+ char *key_root = "storage.gluster";
char *xlator_name = NULL;
char *instance_name = NULL;
size_t key_len = 0;
@@ -719,7 +799,7 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {
}
if (strcmp (__progname, "glusterfsd") == 0)
- key_root = "gluster.brick";
+ key_root = "storage.gluster.brick";
if (instance_name) {
/* +3 for 2 x "." + NULL */
@@ -779,6 +859,7 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
float fop_lat_min;
float fop_lat_max;
double interval_sec;
+ loc_t unused_loc = {0, };
interval_sec = ((now->tv_sec * 1000000.0 + now->tv_usec) -
(stats->started_at.tv_sec * 1000000.0 +
@@ -883,6 +964,29 @@ io_stats_dump_global_to_json_logfp (xlator_t *this,
"\"%s.%s.fop.%s.latency_max_usec\": \"%0.2lf\",",
key_prefix, str_prefix, lc_fop_name, fop_lat_max);
}
+
+ dict_t *xattr = NULL;
+ ret = syncop_getxattr (this, &unused_loc, &xattr,
+ IO_THREADS_QUEUE_SIZE_KEY, NULL, NULL);
+ if (xattr) {
+ /* Iterate over the dictionary returned to us by io-threads and
+ * dump the results to the stats file. */
+ data_pair_t *curr = NULL;
+ dict_for_each (xattr, curr) {
+ ios_log (this, logfp,
+ "\"%s.%s.%s.queue_size\": \"%d\",",
+ key_prefix, str_prefix, curr->key,
+ data_to_int32 (curr->value));
+ }
+
+ /* Free the dictionary */
+ dict_unref (xattr);
+ } else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to get queue size counts from "
+ "the io-threads translator!");
+ }
+
if (interval == -1) {
ios_log (this, logfp, "\"%s.%s.uptime\": \"%"PRId64"\",",
key_prefix, str_prefix,
@@ -1010,7 +1114,10 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
char *port_pos = NULL;
char *group_name = NULL;
char *username = NULL;
+ char *path = NULL;
struct ios_conf *conf = NULL;
+ const char *error_string = NULL;
+ int32_t op_errno = 0;
conf = this->private;
@@ -1057,12 +1164,22 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,
sprintf (group_name, "%d", (int32_t)sample->gid);
}
+ path = "Unknown";
+ if (sample->have_path)
+ path = sample->path;
+
+ error_string = "No Error";
+ if (sample->op_ret != 0) {
+ op_errno = abs (sample->op_errno);
+ error_string = strerror (op_errno);
+ }
+
ios_log (this, logfp,
- "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s",
+ "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s",
epoch_time, fop_enum_to_pri_string (sample->fop_type),
fop_enum_to_string (sample->fop_type),
sample->elapsed, xlator_name, instance_name, username,
- group_name, hostname, port);
+ group_name, hostname, port, path, op_errno, error_string);
goto out;
err:
gf_log (this->name, GF_LOG_ERROR,
@@ -1608,14 +1725,87 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd)
return 0;
}
+void ios_local_get_inode (struct ios_local *local, inode_t **inode)
+{
+ if (!local)
+ return;
+
+ /* When a loc is given to us, use it as the source
+ * of truth for the inode.
+ */
+ if (local->loc.inode) {
+ *inode = local->loc.inode;
+ return;
+ }
+
+ /* Fall back to the inode in the local struct,
+ * but there is no guarantee this will be a valid
+ * pointer.
+ */
+ *inode = local->inode;
+}
+
+void ios_local_get_path (call_frame_t *frame, const char **path)
+{
+ struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
+ inode_t *inode = NULL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ ios_local_get_inode (local, &inode);
+
+ if (inode) {
+ /* Each inode should have an iosstat struct attached to it.
+ * This is the preferred way to retrieve the path.
+ */
+ ios_inode_ctx_get (inode, frame->this, &iosstat);
+ if (iosstat) {
+ gf_log ("io-stats", GF_LOG_DEBUG,
+ "[%s] Getting path from iostat struct",
+ fop_enum_to_string (frame->op));
+ *path = iosstat->filename;
+ goto out;
+ }
+ }
+
+ /* If we don't have the iosstat attached to the inode,
+ * fall back to retrieving the path via the loc struct
+ * inside the local.
+ */
+ if (local->loc.path) {
+ gf_log ("io-stats", GF_LOG_DEBUG,
+ "[%s] Getting path from loc_t",
+ fop_enum_to_string (frame->op));
+ *path = local->loc.path;
+ goto out;
+ }
+
+out:
+ /* If the inode and the loc don't have the path, we're out of luck.
+ */
+ if (!*path) {
+ gf_log ("io-stats", GF_LOG_DEBUG,
+ "Unable to get path for fop: %s",
+ fop_enum_to_string (frame->op));
+ }
+
+ return;
+}
+
void collect_ios_latency_sample (struct ios_conf *conf,
glusterfs_fop_t fop_type, double elapsed,
- call_frame_t *frame)
+ call_frame_t *frame, int32_t op_ret, int32_t op_errno)
{
+ struct ios_local *ios_local = NULL;
ios_sample_buf_t *ios_sample_buf = NULL;
ios_sample_t *ios_sample = NULL;
struct timeval *timestamp = NULL;
call_stack_t *root = NULL;
+ const char *path = NULL;
ios_sample_buf = conf->ios_sample_buf;
@@ -1630,6 +1820,8 @@ void collect_ios_latency_sample (struct ios_conf *conf,
ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);
ios_sample->elapsed = elapsed;
ios_sample->fop_type = fop_type;
+ ios_sample->op_ret = op_ret;
+ ios_sample->op_errno = op_errno;
ios_sample->uid = root->uid;
ios_sample->gid = root->gid;
(ios_sample->timestamp).tv_sec = timestamp->tv_sec;
@@ -1637,6 +1829,52 @@ void collect_ios_latency_sample (struct ios_conf *conf,
memcpy (&ios_sample->identifier, &root->identifier,
sizeof (root->identifier));
+ /* Eventually every FOP will be supported
+ * (i.e., the frame->local will be
+ * of type struct ios_local), but for now, this is a safety.
+ */
+ switch (ios_sample->fop_type) {
+
+ case GF_FOP_CREATE:
+ case GF_FOP_OPEN:
+ case GF_FOP_STAT:
+ case GF_FOP_FSTAT:
+ case GF_FOP_READ:
+ case GF_FOP_WRITE:
+ case GF_FOP_OPENDIR:
+ case GF_FOP_READDIRP:
+ case GF_FOP_READDIR:
+ case GF_FOP_FLUSH:
+ case GF_FOP_ACCESS:
+ case GF_FOP_UNLINK:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RMDIR:
+ case GF_FOP_SETATTR:
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FXATTROP:
+ case GF_FOP_XATTROP:
+ case GF_FOP_GETXATTR:
+ case GF_FOP_FGETXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_STATFS:
+ case GF_FOP_FSYNC:
+ ios_local_get_path (frame, &path);
+ break;
+ default:
+ path = NULL;
+ break;
+ }
+
+ if (path) {
+ strncpy (ios_sample->path, path, sizeof (ios_sample->path));
+ ios_sample->have_path = _gf_true;
+ }
+
/* We've reached the end of the circular buffer, start from the
* beginning. */
if (ios_sample_buf->pos == (ios_sample_buf->size - 1))
@@ -1674,7 +1912,7 @@ update_ios_latency_stats (struct ios_global_stats *stats, double elapsed,
int
update_ios_latency (struct ios_conf *conf, call_frame_t *frame,
- glusterfs_fop_t op)
+ glusterfs_fop_t op, int32_t op_ret, int32_t op_errno)
{
double elapsed;
struct timeval *begin, *end;
@@ -1687,7 +1925,7 @@ update_ios_latency (struct ios_conf *conf, call_frame_t *frame,
update_ios_latency_stats (&conf->cumulative, elapsed, op);
update_ios_latency_stats (&conf->incremental, elapsed, op);
- collect_ios_latency_sample (conf, op, elapsed, frame);
+ collect_ios_latency_sample (conf, op, elapsed, frame, op_ret, op_errno);
return 0;
}
@@ -1811,40 +2049,100 @@ unlock_list_head:
return ret;
}
+static int
+attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path,
+ const uuid_t gfid) {
+ struct ios_stat *iosstat = NULL;
+
+ if (!inode) {
+ return -EINVAL;
+ }
+
+ ios_inode_ctx_get (inode, this, &iosstat);
+ if (!iosstat) {
+ iosstat = GF_CALLOC (1, sizeof (*iosstat),
+ gf_io_stats_mt_ios_stat);
+ if (!iosstat) {
+ return -ENOMEM;
+ }
+ iosstat->filename = gf_strdup (path);
+ gf_uuid_copy (iosstat->gfid, gfid);
+ LOCK_INIT (&iosstat->lock);
+ ios_inode_ctx_set (inode, this, iosstat);
+ }
+
+ return 0;
+}
+
+
+int
+ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd)
+{
+ struct ios_fd *ifd = NULL;
+ int ret = 0;
+
+ ifd = GF_CALLOC (1, sizeof (*ifd), gf_io_stats_mt_ios_fd);
+ if (!ifd) {
+ ret = -ENOMEM;
+ goto free_and_out;
+ }
+
+ if (path) {
+ ifd->filename = gf_strdup (path);
+ if (!ifd->filename) {
+ ret = -ENOMEM;
+ goto free_and_out;
+ }
+ }
+
+ gettimeofday (&ifd->opened_at, NULL);
+
+ if (fd)
+ ios_fd_ctx_set (fd, this, ifd);
+
+ *iosfd = ifd;
+
+ return ret;
+
+ /* Failure path */
+free_and_out:
+ if (ifd) {
+ GF_FREE (ifd->filename);
+ GF_FREE (ifd);
+ }
+
+ *iosfd = NULL;
+
+ return ret;
+}
+
+
int
io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
- struct ios_stat *iosstat = NULL;
- struct ios_conf *conf = NULL;
-
- conf = this->private;
+ struct ios_local *local = NULL;
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
- path = frame->local;
- frame->local = NULL;
-
- if (!path)
+ if (op_ret < 0) {
goto unwind;
+ }
- if (op_ret < 0) {
- GF_FREE (path);
+ local = frame->local;
+ if (!local) {
goto unwind;
}
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
+ conf = this->private;
+
+ ios_build_fd (this, local->loc.path, fd, &iosfd);
if (!iosfd) {
- GF_FREE (path);
goto unwind;
}
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
-
- ios_fd_ctx_set (fd, this, iosfd);
LOCK (&conf->lock);
{
conf->cumulative.nr_opens++;
@@ -1855,18 +2153,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&conf->lock);
- iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat);
- if (!iosstat) {
- GF_FREE (path);
- goto unwind;
- }
- iosstat->filename = gf_strdup (path);
- gf_uuid_copy (iosstat->gfid, buf->ia_gfid);
- LOCK_INIT (&iosstat->lock);
- ios_inode_ctx_set (fd->inode, this, iosstat);
+ attach_iosstat_to_inode (this, local->loc.inode, local->loc.path,
+ buf->ia_gfid);
unwind:
- UPDATE_PROFILE_STATS (frame, CREATE);
+ UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -1877,44 +2169,24 @@ int
io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- struct ios_fd *iosfd = NULL;
- char *path = NULL;
- struct ios_stat *iosstat = NULL;
- struct ios_conf *conf = NULL;
-
- conf = this->private;
- path = frame->local;
- frame->local = NULL;
-
- if (!path)
- goto unwind;
+ struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
+ struct ios_conf *conf = NULL;
+ struct ios_fd *iosfd = NULL;
if (op_ret < 0) {
- GF_FREE (path);
goto unwind;
}
- iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd);
- if (!iosfd) {
- GF_FREE (path);
+ local = frame->local;
+ if (!local) {
goto unwind;
}
- iosfd->filename = path;
- gettimeofday (&iosfd->opened_at, NULL);
-
- ios_fd_ctx_set (fd, this, iosfd);
-
- ios_inode_ctx_get (fd->inode, this, &iosstat);
- if (!iosstat) {
- iosstat = GF_CALLOC (1, sizeof (*iosstat),
- gf_io_stats_mt_ios_stat);
- if (iosstat) {
- iosstat->filename = gf_strdup (path);
- gf_uuid_copy (iosstat->gfid, fd->inode->gfid);
- LOCK_INIT (&iosstat->lock);
- ios_inode_ctx_set (fd->inode, this, iosstat);
- }
+ conf = this->private;
+ ios_build_fd (this, local->loc.path, fd, &iosfd);
+ if (!iosfd) {
+ goto unwind;
}
LOCK (&conf->lock);
@@ -1926,13 +2198,19 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
UNLOCK (&conf->lock);
+
+ ios_inode_ctx_get (fd->inode, this, &iosstat);
if (iosstat) {
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN);
- iosstat = NULL;
}
-unwind:
- UPDATE_PROFILE_STATS (frame, OPEN);
+ attach_iosstat_to_inode (this, local->loc.inode,
+ local->loc.path,
+ local->loc.inode->gfid);
+
+unwind:
+ UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
return 0;
@@ -1943,7 +2221,8 @@ int
io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, STAT);
+ UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -1956,26 +2235,29 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iobref *iobref, dict_t *xdata)
{
int len = 0;
- fd_t *fd = NULL;
struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
- fd = frame->local;
- frame->local = NULL;
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
if (op_ret > 0) {
len = iov_length (vector, count);
- BUMP_READ (fd, len);
+ BUMP_READ (local->fd, len);
}
- UPDATE_PROFILE_STATS (frame, READ);
- ios_inode_ctx_get (fd->inode, this, &iosstat);
+ UPDATE_PROFILE_STATS (frame, READ, op_ret, op_errno);
+ ios_inode_ctx_get (local->fd->inode, this, &iosstat);
if (iosstat) {
- BUMP_STATS (iosstat, IOS_STATS_TYPE_READ);
- BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ);
- iosstat = NULL;
+ BUMP_STATS (iosstat, IOS_STATS_TYPE_READ);
+ BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ);
+
}
+unwind:
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
vector, count, buf, iobref, xdata);
return 0;
@@ -1989,21 +2271,23 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
struct ios_stat *iosstat = NULL;
+ struct ios_local *local = NULL;
inode_t *inode = NULL;
- UPDATE_PROFILE_STATS (frame, WRITE);
- if (frame->local){
- inode = frame->local;
- frame->local = NULL;
- ios_inode_ctx_get (inode, this, &iosstat);
- if (iosstat) {
- BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE);
- BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE);
- inode = NULL;
- iosstat = NULL;
- }
- }
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
+ UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno);
+
+ ios_inode_ctx_get (local->inode, this, &iosstat);
+
+ if (iosstat) {
+ BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE);
+ BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE);
+ }
+unwind:
+ ios_free_local (frame);
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
return 0;
@@ -2021,7 +2305,7 @@ io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
frame->local = NULL;
- UPDATE_PROFILE_STATS (frame, READDIRP);
+ UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno);
ios_inode_ctx_get (inode, this, &iosstat);
@@ -2039,7 +2323,16 @@ int
io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, READDIR);
+ UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno);
+
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2050,8 +2343,10 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSYNC);
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+ UPDATE_PROFILE_STATS (frame, FSYNC, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
return 0;
}
@@ -2061,7 +2356,8 @@ io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SETATTR);
+ UPDATE_PROFILE_STATS (frame, SETATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, xdata);
return 0;
}
@@ -2072,7 +2368,8 @@ io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, UNLINK);
+ UPDATE_PROFILE_STATS (frame, UNLINK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
preparent, postparent, xdata);
return 0;
@@ -2086,7 +2383,7 @@ io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preoldparent, struct iatt *postoldparent,
struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, RENAME);
+ UPDATE_PROFILE_STATS (frame, RENAME, op_ret, op_errno);
STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
preoldparent, postoldparent,
prenewparent, postnewparent, xdata);
@@ -2099,7 +2396,8 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, const char *buf,
struct iatt *sbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, READLINK);
+ UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
return 0;
}
@@ -2111,7 +2409,14 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
dict_t *xdata, struct iatt *postparent)
{
- UPDATE_PROFILE_STATS (frame, LOOKUP);
+ struct ios_local *local = frame->local;
+
+ if (local && local->loc.path && inode && op_ret >= 0) {
+ attach_iosstat_to_inode (this, inode, local->loc.path,
+ inode->gfid);
+ }
+ UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,
postparent);
return 0;
@@ -2124,7 +2429,7 @@ io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SYMLINK);
+ UPDATE_PROFILE_STATS (frame, SYMLINK, op_ret, op_errno);
STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2137,7 +2442,7 @@ io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, MKNOD);
+ UPDATE_PROFILE_STATS (frame, MKNOD, op_ret, op_errno);
STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2151,28 +2456,16 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata)
{
- struct ios_stat *iosstat = NULL;
- char *path = frame->local;
+ struct ios_local *local = frame->local;
- if (!path)
- goto unwind;
-
- UPDATE_PROFILE_STATS (frame, MKDIR);
- if (op_ret < 0)
- goto unwind;
-
- iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat);
- if (iosstat) {
- LOCK_INIT (&iosstat->lock);
- iosstat->filename = gf_strdup(path);
- gf_uuid_copy (iosstat->gfid, buf->ia_gfid);
- ios_inode_ctx_set (inode, this, iosstat);
+ if (local && local->loc.path) {
+ local->inode = inode_ref (inode);
+ attach_iosstat_to_inode (this, inode, local->loc.path,
+ buf->ia_gfid);
}
-unwind:
- /* local is assigned with path */
- GF_FREE (frame->local);
- frame->local = NULL;
+ UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2185,7 +2478,7 @@ io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, LINK);
+ UPDATE_PROFILE_STATS (frame, LINK, op_ret, op_errno);
STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
preparent, postparent, xdata);
return 0;
@@ -2196,7 +2489,8 @@ int
io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FLUSH);
+ UPDATE_PROFILE_STATS (frame, FLUSH, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2206,20 +2500,28 @@ int
io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- struct ios_stat *iosstat = NULL;
- int ret = -1;
+ struct ios_local *local = NULL;
+ struct ios_stat *iosstat = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ if (!local || !local->fd)
+ goto unwind;
- UPDATE_PROFILE_STATS (frame, OPENDIR);
if (op_ret < 0)
goto unwind;
- ios_fd_ctx_set (fd, this, 0);
+ attach_iosstat_to_inode (this, local->inode, local->loc.path,
+ local->inode->gfid);
- ret = ios_inode_ctx_get (fd->inode, this, &iosstat);
- if (!ret)
+ ios_fd_ctx_set (local->fd, this, 0);
+ ios_inode_ctx_get (local->fd->inode, this, &iosstat);
+ if (iosstat)
BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR);
unwind:
+ UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
return 0;
}
@@ -2231,8 +2533,8 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, RMDIR);
-
+ UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
preparent, postparent, xdata);
return 0;
@@ -2244,7 +2546,8 @@ io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, TRUNCATE);
+ UPDATE_PROFILE_STATS (frame, TRUNCATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
prebuf, postbuf, xdata);
return 0;
@@ -2255,7 +2558,8 @@ int
io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, STATFS);
+ UPDATE_PROFILE_STATS (frame, STATFS, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2265,7 +2569,8 @@ int
io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, SETXATTR);
+ UPDATE_PROFILE_STATS (frame, SETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2275,7 +2580,8 @@ int
io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, GETXATTR);
+ UPDATE_PROFILE_STATS (frame, GETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2285,7 +2591,8 @@ int
io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, REMOVEXATTR);
+ UPDATE_PROFILE_STATS (frame, REMOVEXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2294,7 +2601,8 @@ int
io_stats_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSETXATTR);
+ UPDATE_PROFILE_STATS (frame, FSETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2304,7 +2612,8 @@ int
io_stats_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FGETXATTR);
+ UPDATE_PROFILE_STATS (frame, FGETXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2314,7 +2623,8 @@ int
io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FREMOVEXATTR);
+ UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2324,7 +2634,8 @@ int
io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSYNCDIR);
+ UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2334,7 +2645,20 @@ int
io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, ACCESS);
+ struct ios_local *local = frame->local;
+
+ /* ACCESS is called before a READ when a fop fails over
+ * in NFS. We need to make sure that we are attaching the
+ * data correctly to this inode.
+ */
+ if (local->loc.inode && local->loc.path) {
+ attach_iosstat_to_inode (this, local->loc.inode,
+ local->loc.path,
+ local->loc.inode->gfid);
+ }
+
+ UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2345,7 +2669,8 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FTRUNCATE);
+ UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,
prebuf, postbuf, xdata);
return 0;
@@ -2356,7 +2681,8 @@ int
io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FSTAT);
+ UPDATE_PROFILE_STATS (frame, FSTAT, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
@@ -2367,8 +2693,9 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, FALLOCATE);
- STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -2379,8 +2706,9 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, DISCARD);
- STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ UPDATE_PROFILE_STATS (frame, DISCARD, op_ret, op_errno);
+ ios_free_local (frame);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
}
@@ -2390,7 +2718,8 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- UPDATE_PROFILE_STATS(frame, ZEROFILL);
+ UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
return 0;
@@ -2400,7 +2729,8 @@ int
io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, LK);
+ UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);
return 0;
}
@@ -2410,7 +2740,8 @@ int
io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, ENTRYLK);
+ UPDATE_PROFILE_STATS (frame, ENTRYLK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2420,7 +2751,8 @@ int
io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, XATTROP);
+ UPDATE_PROFILE_STATS (frame, XATTROP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2430,7 +2762,8 @@ int
io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, FXATTROP);
+ UPDATE_PROFILE_STATS (frame, FXATTROP, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -2440,7 +2773,8 @@ int
io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- UPDATE_PROFILE_STATS (frame, INODELK);
+ UPDATE_PROFILE_STATS (frame, INODELK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2450,6 +2784,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_entrylk_cbk,
@@ -2464,6 +2800,7 @@ int
io_stats_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
@@ -2479,8 +2816,8 @@ int
io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
-
- UPDATE_PROFILE_STATS (frame, FINODELK);
+ UPDATE_PROFILE_STATS (frame, FINODELK, op_ret, op_errno);
+ ios_free_local (frame);
STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2490,6 +2827,7 @@ int
io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_finodelk_cbk,
@@ -2504,6 +2842,7 @@ int
io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_xattrop_cbk,
@@ -2518,6 +2857,7 @@ int
io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fxattrop_cbk,
@@ -2532,6 +2872,7 @@ int
io_stats_lookup (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_lookup_cbk,
@@ -2545,6 +2886,7 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this,
int
io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_stat_cbk,
@@ -2559,6 +2901,7 @@ int
io_stats_readlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, size_t size, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_readlink_cbk,
@@ -2573,6 +2916,7 @@ int
io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_mknod_cbk,
@@ -2587,9 +2931,7 @@ int
io_stats_mkdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
-
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_mkdir_cbk,
@@ -2604,6 +2946,7 @@ int
io_stats_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_unlink_cbk,
@@ -2618,6 +2961,7 @@ int
io_stats_rmdir (call_frame_t *frame, xlator_t *this,
loc_t *loc, int flags, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_rmdir_cbk,
@@ -2674,6 +3018,7 @@ int
io_stats_setattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_setattr_cbk,
@@ -2688,6 +3033,7 @@ int
io_stats_truncate (call_frame_t *frame, xlator_t *this,
loc_t *loc, off_t offset, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_truncate_cbk,
@@ -2702,8 +3048,8 @@ int
io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t flags, fd_t *fd, dict_t *xdata)
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
+ ios_track_loc (frame, loc);
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
@@ -2719,9 +3065,10 @@ int
io_stats_create (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, mode_t mode,
mode_t umask, fd_t *fd, dict_t *xdata)
+
{
- if (loc->path)
- frame->local = gf_strdup (loc->path);
+ ios_track_loc (frame, loc);
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
@@ -2737,8 +3084,7 @@ int
io_stats_readv (call_frame_t *frame, xlator_t *this,
fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
- frame->local = fd;
-
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_readv_cbk,
@@ -2756,9 +3102,12 @@ io_stats_writev (call_frame_t *frame, xlator_t *this,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
int len = 0;
+ struct ios_conf *conf = NULL;
+ struct ios_local *local = NULL;
+ int ret = 0;
+
+ ios_track_fd (frame, fd);
- if (fd->inode)
- frame->local = fd->inode;
len = iov_length (vector, count);
BUMP_WRITE (fd, len);
@@ -2777,6 +3126,7 @@ int
io_stats_statfs (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_statfs_cbk,
@@ -2791,6 +3141,7 @@ int
io_stats_flush (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_flush_cbk,
@@ -2805,6 +3156,7 @@ int
io_stats_fsync (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t flags, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fsync_cbk,
@@ -2971,7 +3323,7 @@ _ios_dump_thread (xlator_t *this) {
stats_filename, strerror(errno));
log_stats_fopen_failure = _gf_false;
}
- samples_logfp = fopen (samples_filename, "w+");
+ samples_logfp = fopen (samples_filename, "a");
if (samples_logfp) {
io_stats_dump_latency_samples_logfp (this,
samples_logfp);
@@ -3024,6 +3376,8 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this,
goto out;
}
+ ios_track_loc (frame, loc);
+
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_setxattr_cbk,
@@ -3042,6 +3396,7 @@ int
io_stats_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_getxattr_cbk,
@@ -3056,6 +3411,7 @@ int
io_stats_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_removexattr_cbk,
@@ -3071,6 +3427,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *dict,
int32_t flags, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fsetxattr_cbk,
@@ -3085,6 +3442,7 @@ int
io_stats_fgetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fgetxattr_cbk,
@@ -3099,6 +3457,7 @@ int
io_stats_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fremovexattr_cbk,
@@ -3170,6 +3529,7 @@ int
io_stats_access (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t mask, dict_t *xdata)
{
+ ios_track_loc (frame, loc);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_access_cbk,
@@ -3212,6 +3572,7 @@ int
io_stats_fstat (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
+ ios_track_fd (frame, fd);
START_FOP_LATENCY (frame);
STACK_WIND (frame, io_stats_fstat_cbk,
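Taken together, the io-stats hunks above replace the old habit of stashing a raw pointer (a strdup'd loc->path, an fd, or an inode) directly in frame->local with tracking helpers on the wind path and an explicit ios_free_local() on the unwind path. The helpers themselves are defined elsewhere in the patch; the standalone sketch below only illustrates the general wind/track, unwind/free pattern with hypothetical stand-in types, not the real GlusterFS call_frame_t, loc_t or fd_t structures.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical, simplified stand-ins: the real translator remembers the
 * loc/fd of each fop inside frame->local so the callback can attribute
 * latency and errors to a path or descriptor. */
struct fake_local {
        char *path;     /* copy of the path being operated on */
};

struct fake_frame {
        struct fake_local *local;
};

/* analogous to ios_track_loc(): remember the path for this call */
static void track_path (struct fake_frame *frame, const char *path)
{
        if (!frame->local)
                frame->local = calloc (1, sizeof (*frame->local));
        if (frame->local && path)
                frame->local->path = strdup (path);
}

/* analogous to ios_free_local(): release per-call state in the callback */
static void free_local (struct fake_frame *frame)
{
        if (!frame->local)
                return;
        free (frame->local->path);
        free (frame->local);
        frame->local = NULL;
}

int main (void)
{
        struct fake_frame frame = { 0 };

        track_path (&frame, "/dir/file");               /* wind side   */
        if (frame.local && frame.local->path)
                printf ("fop on %s completed\n", frame.local->path);
        free_local (&frame);                            /* unwind side */
        return 0;
}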
diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
index 270632bc71b..2eb3a9f9149 100644
--- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c
+++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
@@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc,
case RPC_CLNT_DISCONNECT:
case RPC_CLNT_MSG:
case RPC_CLNT_DESTROY:
+ case RPC_CLNT_PING:
break;
}
diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c
index 77637c7beec..459d173db7f 100644
--- a/xlators/features/changelog/src/changelog-ev-handle.c
+++ b/xlators/features/changelog/src/changelog-ev-handle.c
@@ -180,6 +180,8 @@ changelog_rpc_notify (struct rpc_clnt *rpc,
/* Free up mydata */
changelog_rpc_clnt_unref (crpc);
break;
+ case RPC_CLNT_PING:
+ break;
}
return 0;
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index 640c6bb5553..d7c210f24a5 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -234,6 +234,7 @@ blkd:
continue;
bcount++;
+ list_del_init (&ilock->client_list);
list_del_init (&ilock->blocked_locks);
list_add (&ilock->blocked_locks, &released);
}
@@ -268,6 +269,7 @@ granted:
continue;
gcount++;
+ list_del_init (&ilock->client_list);
list_del_init (&ilock->list);
list_add (&ilock->list, &released);
}
@@ -321,6 +323,7 @@ blkd:
bcount++;
+ list_del_init (&elock->client_list);
list_del_init (&elock->blocked_locks);
list_add_tail (&elock->blocked_locks, &released);
}
@@ -355,6 +358,7 @@ granted:
}
gcount++;
+ list_del_init (&elock->client_list);
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index 68904f63140..8a56c4205d9 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -1108,3 +1108,16 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)
return conf;
}
+
+gf_boolean_t
+pl_does_monkey_want_stuck_lock()
+{
+ long int monkey_unlock_rand = 0;
+ long int monkey_unlock_rand_rem = 0;
+
+ monkey_unlock_rand = random ();
+ monkey_unlock_rand_rem = monkey_unlock_rand % 100;
+ if (monkey_unlock_rand_rem == 0)
+ return _gf_true;
+ return _gf_false;
+}
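pl_does_monkey_want_stuck_lock() returns true for roughly one call in a hundred; the locks translator later uses it to deliberately drop unlock requests when the monkey-unlocking option is enabled, simulating clients that die while holding locks. A minimal standalone sketch of the same probability check (assuming random() is seeded elsewhere, as it is in the brick process):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Returns 1 roughly once per 100 calls -- the same "about 1% of unlocks
 * get dropped" behaviour the monkey-unlocking option relies on. */
static int monkey_wants_stuck_lock (void)
{
        return (random () % 100) == 0;
}

int main (void)
{
        int i, stuck = 0;

        srandom ((unsigned) time (NULL));
        for (i = 0; i < 10000; i++)
                stuck += monkey_wants_stuck_lock ();

        /* expect a count near 100 */
        printf ("dropped %d of 10000 simulated unlocks\n", stuck);
        return 0;
}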
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 5486f9b8314..3729ca24bed 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -161,4 +161,7 @@ pl_metalock_is_active (pl_inode_t *pl_inode);
int
__pl_queue_lock (pl_inode_t *pl_inode, posix_lock_t *reqlock, int can_block);
+
+gf_boolean_t
+pl_does_monkey_want_stuck_lock();
#endif /* __COMMON_H__ */
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index 783c57e6381..626541237b3 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -16,9 +16,9 @@
#include "list.h"
#include "locks.h"
+#include "clear.h"
#include "common.h"
-
void
__pl_entrylk_unref (pl_entry_lock_t *lock)
{
@@ -111,6 +111,97 @@ __conflicting_entrylks (pl_entry_lock_t *l1, pl_entry_lock_t *l2)
return 0;
}
+/* See comments in inodelk.c for details */
+static inline gf_boolean_t
+__stale_entrylk (xlator_t *this, pl_entry_lock_t *candidate_lock,
+ pl_entry_lock_t *requested_lock, time_t *lock_age_sec)
+{
+ posix_locks_private_t *priv = NULL;
+ struct timeval curr;
+ gettimeofday (&curr, NULL);
+
+ priv = this->private;
+
+ /* Question: Should we just prune them all given the
+ * chance? Or just the locks we are attempting to acquire?
+ */
+ if (names_conflict (candidate_lock->basename,
+ requested_lock->basename)) {
+ *lock_age_sec = curr.tv_sec -
+ candidate_lock->granted_time.tv_sec;
+ if (*lock_age_sec > priv->revocation_secs)
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
+/* See comments in inodelk.c for details */
+static gf_boolean_t
+__entrylk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+ pl_entry_lock_t *lock)
+{
+ posix_locks_private_t *priv = NULL;
+ pl_entry_lock_t *tmp = NULL;
+ pl_entry_lock_t *lk = NULL;
+ gf_boolean_t revoke_lock = _gf_false;
+ int bcount = 0;
+ int gcount = 0;
+ int op_errno = 0;
+ clrlk_args args;
+ args.opts = NULL;
+ time_t lk_age_sec = 0;
+ uint32_t max_blocked = 0;
+ char *reason_str = NULL;
+
+ priv = this->private;
+ args.type = CLRLK_ENTRY;
+ if (priv->revocation_clear_all == _gf_true)
+ args.kind = CLRLK_ALL;
+ else
+ args.kind = CLRLK_GRANTED;
+
+
+ if (list_empty (&dom->entrylk_list))
+ goto out;
+
+ pthread_mutex_lock (&pinode->mutex);
+ lock->pinode = pinode;
+ list_for_each_entry_safe (lk, tmp, &dom->entrylk_list, domain_list) {
+ if (__stale_entrylk (this, lk, lock, &lk_age_sec) == _gf_true) {
+ revoke_lock = _gf_true;
+ reason_str = "age";
+ break;
+ }
+ }
+ max_blocked = priv->revocation_max_blocked;
+ if (max_blocked != 0 && revoke_lock == _gf_false) {
+ list_for_each_entry_safe (lk, tmp, &dom->blocked_entrylks,
+ blocked_locks) {
+ max_blocked--;
+ if (max_blocked == 0) {
+ revoke_lock = _gf_true;
+ reason_str = "max blocked";
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+
+out:
+ if (revoke_lock == _gf_true) {
+ clrlk_clear_entrylk (this, pinode, dom, &args, &bcount, &gcount,
+ &op_errno);
+ gf_log (this->name, GF_LOG_WARNING,
+ "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+ "age: %ld sec] - Entry lock revoked: %d granted & %d "
+ "blocked locks cleared", reason_str,
+ uuid_utoa (pinode->gfid), dom->domain, lk_age_sec,
+ gcount, bcount);
+ }
+
+ return revoke_lock;
+}
+
/**
* entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
@@ -546,6 +637,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
pl_ctx_t *ctx = NULL;
int nonblock = 0;
gf_boolean_t need_inode_unref = _gf_false;
+ posix_locks_private_t *priv = NULL;
+
+ priv = this->private;
if (xdata)
dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
@@ -599,6 +693,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
* current stack unwinds.
*/
pinode->inode = inode_ref (inode);
+ if (priv->revocation_secs != 0) {
+ if (cmd != ENTRYLK_UNLOCK) {
+ __entrylk_prune_stale (this, pinode, dom, reqlock);
+ } else if (priv->monkey_unlocking == _gf_true) {
+ if (pl_does_monkey_want_stuck_lock ()) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "MONKEY LOCKING (forcing stuck lock)!");
+ op_ret = 0;
+ goto out;
+ }
+ }
+ }
switch (cmd) {
case ENTRYLK_LOCK_NB:
@@ -678,8 +784,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
"a bug report at http://bugs.gluster.com", cmd);
goto out;
}
- if (need_inode_unref)
- inode_unref (pinode->inode);
/* The following (extra) unref corresponds to the ref that
* was done at the time the lock was granted.
@@ -689,6 +793,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
out:
+ if (need_inode_unref)
+ inode_unref (pinode->inode);
+
if (unwind) {
entrylk_trace_out (this, frame, volume, fd, loc, basename,
cmd, type, op_ret, op_errno);
@@ -810,6 +917,8 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
* blocked to avoid leaving L1 to starve forever.
* iv. unref the object.
*/
+ list_del_init (&l->client_list);
+
if (!list_empty (&l->domain_list)) {
list_del_init (&l->domain_list);
list_add_tail (&l->client_list,
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 1564f26b8fb..275fb9d20e4 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -16,6 +16,7 @@
#include "list.h"
#include "locks.h"
+#include "clear.h"
#include "common.h"
void
@@ -130,6 +131,105 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
inodelk_type_conflict (l1, l2));
}
+/*
+ * Check to see if the candidate lock overlaps/conflicts with the
+ * requested lock. If so, determine how old the lock is and return
+ * true if it exceeds the configured threshold, false otherwise.
+ */
+static inline gf_boolean_t
+__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock,
+ pl_inode_lock_t *requested_lock, time_t *lock_age_sec)
+{
+ posix_locks_private_t *priv = NULL;
+ struct timeval curr;
+
+ priv = this->private;
+ gettimeofday (&curr, NULL);
+ /* Question: Should we just prune them all given the
+ * chance? Or just the locks we are attempting to acquire?
+ */
+ if (inodelk_conflict (candidate_lock, requested_lock)) {
+ *lock_age_sec = curr.tv_sec -
+ candidate_lock->granted_time.tv_sec;
+ if (*lock_age_sec > priv->revocation_secs)
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
+/* Examine any locks held on this inode and potentially revoke the lock
+ * if the age exceeds revocation_secs. We will clear _only_ those locks
+ * which are granted, and then grant those locks which are blocked.
+ *
+ * Depending on how this patch works in the wild, we may expand this and
+ * introduce a heuristic which clears blocked locks as well if they
+ * are beyond a threshold.
+ */
+static gf_boolean_t
+__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+ pl_inode_lock_t *lock)
+{
+ posix_locks_private_t *priv = NULL;
+ pl_inode_lock_t *tmp = NULL;
+ pl_inode_lock_t *lk = NULL;
+ gf_boolean_t revoke_lock = _gf_false;
+ int bcount = 0;
+ int gcount = 0;
+ int op_errno = 0;
+ clrlk_args args;
+ args.opts = NULL;
+ time_t lk_age_sec = 0;
+ uint32_t max_blocked = 0;
+ char *reason_str = NULL;
+
+ priv = this->private;
+
+ args.type = CLRLK_INODE;
+ if (priv->revocation_clear_all == _gf_true)
+ args.kind = CLRLK_ALL;
+ else
+ args.kind = CLRLK_GRANTED;
+
+ if (list_empty (&dom->inodelk_list))
+ goto out;
+
+ pthread_mutex_lock (&pinode->mutex);
+ list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) {
+ if (__stale_inodelk (this, lk, lock, &lk_age_sec) == _gf_true) {
+ revoke_lock = _gf_true;
+ reason_str = "age";
+ break;
+ }
+ }
+
+ max_blocked = priv->revocation_max_blocked;
+ if (max_blocked != 0 && revoke_lock == _gf_false) {
+ list_for_each_entry_safe (lk, tmp, &dom->blocked_inodelks,
+ blocked_locks) {
+ max_blocked--;
+ if (max_blocked == 0) {
+ revoke_lock = _gf_true;
+ reason_str = "max blocked";
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+
+out:
+ if (revoke_lock == _gf_true) {
+ clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount,
+ &op_errno);
+ gf_log (this->name, GF_LOG_WARNING,
+ "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+ "age: %ld sec] - Inode lock revoked: %d granted & %d "
+ "blocked locks cleared",
+ reason_str, uuid_utoa (pinode->gfid), dom->domain,
+ lk_age_sec, gcount, bcount);
+ }
+ return revoke_lock;
+}
+
/* Determine if lock is grantable or not */
static pl_inode_lock_t *
__inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)
@@ -419,8 +519,6 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
client_list) {
- list_del_init (&l->client_list);
-
pl_inodelk_log_cleanup (l);
pl_inode = l->pl_inode;
@@ -458,6 +556,8 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
* forever.
* iv. unref the object.
*/
+ list_del_init (&l->client_list);
+
if (!list_empty (&l->list)) {
__delete_inode_lock (l);
list_add_tail (&l->client_list,
@@ -509,6 +609,7 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
inode_t *inode)
{
+ posix_locks_private_t *priv = NULL;
int ret = -EINVAL;
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
@@ -518,6 +619,8 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
lock->pl_inode = pl_inode;
fl_type = lock->fl_type;
+ priv = this->private;
+
/* Ideally, AFTER a successful lock (both blocking and non-blocking) or
* an unsuccessful blocking lock operation, the inode needs to be ref'd.
*
@@ -537,6 +640,18 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
*/
pl_inode->inode = inode_ref (inode);
+ if (priv->revocation_secs != 0) {
+ if (lock->fl_type != F_UNLCK) {
+ __inodelk_prune_stale (this, pl_inode, dom, lock);
+ } else if (priv->monkey_unlocking == _gf_true) {
+ if (pl_does_monkey_want_stuck_lock ()) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "MONKEY LOCKING (forcing stuck lock)!");
+ return 0;
+ }
+ }
+ }
+
if (ctx)
pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
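Both __stale_entrylk() and __stale_inodelk() reduce to the same age test: if a conflicting lock was granted more than revocation-secs ago, it becomes a candidate for revocation via the clrlk_clear_*() helpers. The self-contained sketch below shows just that age computation with made-up values; the granted_time bookkeeping and the conflict checks belong to the translator and are not reproduced here.

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

/* Is a lock granted at 'granted' older than 'revocation_secs' right now?
 * Mirrors the age check in __stale_inodelk()/__stale_entrylk(). */
static int lock_is_stale (const struct timeval *granted,
                          time_t revocation_secs, time_t *age_out)
{
        struct timeval now;

        gettimeofday (&now, NULL);
        *age_out = now.tv_sec - granted->tv_sec;
        return *age_out > revocation_secs;
}

int main (void)
{
        struct timeval granted;
        time_t age = 0;

        gettimeofday (&granted, NULL);
        granted.tv_sec -= 300;          /* pretend it was granted 5 min ago */

        if (lock_is_stale (&granted, 120, &age))
                printf ("lock is stale (age %ld s > 120 s), revoke it\n",
                        (long) age);
        else
                printf ("lock is still fresh (age %ld s)\n", (long) age);
        return 0;
}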
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index e363f425b65..8eb35da44be 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -190,6 +190,10 @@ typedef struct {
mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */
gf_boolean_t trace; /* trace lock requests in and out */
char *brickname;
+ gf_boolean_t monkey_unlocking;
+ uint32_t revocation_secs;
+ gf_boolean_t revocation_clear_all;
+ uint32_t revocation_max_blocked;
} posix_locks_private_t;
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index f217220a04b..616be0f7cff 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -3627,7 +3627,21 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("trace", priv->trace, options, bool, out);
+ GF_OPTION_RECONF ("monkey-unlocking", priv->monkey_unlocking, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("revocation-secs",
+ priv->revocation_secs, options,
+ uint32, out);
+
+ GF_OPTION_RECONF ("revocation-clear-all", priv->revocation_clear_all,
+ options, bool, out);
+
+ GF_OPTION_RECONF ("revocation-max-blocked",
+ priv->revocation_max_blocked, options,
+ uint32, out);
ret = 0;
+
out:
return ret;
}
@@ -3678,6 +3692,18 @@ init (xlator_t *this)
GF_OPTION_INIT ("trace", priv->trace, bool, out);
+ GF_OPTION_INIT ("monkey-unlocking", priv->monkey_unlocking,
+ bool, out);
+
+ GF_OPTION_INIT ("revocation-secs", priv->revocation_secs,
+ uint32, out);
+
+ GF_OPTION_INIT ("revocation-clear-all", priv->revocation_clear_all,
+ bool, out);
+
+ GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked,
+ uint32, out);
+
this->local_pool = mem_pool_new (pl_local_t, 32);
if (!this->local_pool) {
ret = -1;
@@ -3934,5 +3960,35 @@ struct volume_options options[] = {
.description = "Trace the different lock requests "
"to logs."
},
+ { .key = { "monkey-unlocking" },
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "Ignore a random number of unlock requests. Useful "
+ "for testing/creating robust lock recovery mechanisms."
+ },
+ { .key = {"revocation-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "0",
+ .description = "Maximum time a lock can be taken out, before"
+ "being revoked.",
+ },
+ { .key = {"revocation-clear-all"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "If set to true, will revoke BOTH granted and blocked "
+ "(pending) lock requests if a revocation threshold is "
+ "hit.",
+ },
+ { .key = {"revocation-max-blocked"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "0",
+ .description = "A number of blocked lock requests after which a lock "
+ "will be revoked to allow the others to proceed. Can "
+ "be used in conjunction w/ revocation-clear-all."
+ },
{ .key = {NULL} },
};
diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c
index f578f6c3f44..9201f38f7ff 100644
--- a/xlators/features/marker/src/marker.c
+++ b/xlators/features/marker/src/marker.c
@@ -390,13 +390,6 @@ _is_quota_internal_xattr (dict_t *d, char *k, data_t *v, void *data)
if (fnmatch ("trusted.glusterfs.quota*", k, 0) == 0)
return _gf_true;
- /* It would be nice if posix filters pgfid xattrs. But since marker
- * also takes up responsibility to clean these up, adding the filtering
- * here (Check 'quota_xattr_cleaner')
- */
- if (fnmatch (PGFID_XATTR_KEY_PREFIX"*", k, 0) == 0)
- return _gf_true;
-
return _gf_false;
}
@@ -1598,9 +1591,10 @@ marker_get_oldpath_contribution (call_frame_t *lk_frame, void *cookie,
*/
MARKER_SET_UID_GID (frame, local, frame->root);
- if (gf_uuid_is_null (oplocal->loc.gfid))
- gf_uuid_copy (oplocal->loc.gfid,
- oplocal->loc.inode->gfid);
+ if (gf_uuid_is_null (oplocal->loc.gfid)) {
+ gf_uuid_copy (oplocal->loc.gfid,
+ oplocal->loc.inode->gfid);
+ }
GF_UUID_ASSERT (oplocal->loc.gfid);
diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c
index dd7bf809e21..2e68b318a9c 100644
--- a/xlators/features/quota/src/quota.c
+++ b/xlators/features/quota/src/quota.c
@@ -2200,7 +2200,7 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_msg (this->name, GF_LOG_INFO, EINVAL,
Q_MSG_INODE_CTX_GET_FAILED,
"quota context not set inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ uuid_utoa (local->loc.gfid));
goto out;
}
diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c
index fc2ff2ab10d..f5062971bf4 100644
--- a/xlators/features/snapview-server/src/snapview-server-mgmt.c
+++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c
@@ -73,7 +73,7 @@ svs_mgmt_init (xlator_t *this)
if (cmd_args->volfile_server)
host = cmd_args->volfile_server;
- ret = rpc_transport_inet_options_build (&options, host, port);
+ ret = rpc_transport_inet_options_build (&options, host, port, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to build the "
"transport options");
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index bf62290d023..1770d9dd874 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3418,7 +3418,7 @@ glusterd_transport_keepalive_options_get (int *interval, int *time,
int
glusterd_transport_inet_options_build (dict_t **options, const char *hostname,
- int port)
+ int port, char *addr_family)
{
dict_t *dict = NULL;
int32_t interval = -1;
@@ -3433,7 +3433,8 @@ glusterd_transport_inet_options_build (dict_t **options, const char *hostname,
port = GLUSTERD_DEFAULT_PORT;
/* Build default transport options */
- ret = rpc_transport_inet_options_build (&dict, hostname, port);
+ ret = rpc_transport_inet_options_build (&dict, hostname, port,
+ addr_family);
if (ret)
goto out;
@@ -3470,6 +3471,7 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
int ret = -1;
glusterd_peerctx_t *peerctx = NULL;
data_t *data = NULL;
+ char *addr_family = NULL;
peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t);
if (!peerctx)
@@ -3485,9 +3487,15 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
uniquely identify a
peerinfo */
+ if (dict_get_str(this->options, "transport.address-family",
+ &addr_family)) {
+ addr_family = NULL;
+ }
+
ret = glusterd_transport_inet_options_build (&options,
peerinfo->hostname,
- peerinfo->port);
+ peerinfo->port,
+ addr_family);
if (ret)
goto out;
@@ -5157,11 +5165,16 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
this = THIS;
conf = this->private;
- if (RPC_CLNT_DESTROY == event) {
+ switch (event) {
+ case RPC_CLNT_DESTROY:
GF_FREE (peerctx->errstr);
GF_FREE (peerctx->peername);
GF_FREE (peerctx);
return 0;
+ case RPC_CLNT_PING:
+ return 0;
+ default:
+ break;
}
rcu_read_lock ();
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
index 0ea66a027bf..4fdff3402f5 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -241,6 +241,50 @@ build_volfile_path (char *volume_id, char *path,
}
+ volid_ptr = strstr (volume_id, "gfproxy-client/");
+ if (volid_ptr) {
+ volid_ptr = strchr (volid_ptr, '/');
+ if (!volid_ptr) {
+ ret = -1;
+ goto out;
+ }
+ volid_ptr++;
+
+ ret = glusterd_volinfo_find (volid_ptr, &volinfo);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Couldn't find volinfo");
+ goto out;
+ }
+
+ glusterd_get_gfproxy_client_volfile (volinfo, path, path_len);
+
+ ret = 0;
+ goto out;
+ }
+
+ volid_ptr = strstr (volume_id, "gfproxy/");
+ if (volid_ptr) {
+ volid_ptr = strchr (volid_ptr, '/');
+ if (!volid_ptr) {
+ ret = -1;
+ goto out;
+ }
+ volid_ptr++;
+
+ ret = glusterd_volinfo_find (volid_ptr, &volinfo);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Couldn't find volinfo");
+ goto out;
+ }
+
+ glusterd_get_gfproxyd_volfile (volinfo, path, path_len);
+
+ ret = 0;
+ goto out;
+ }
+
volid_ptr = strstr (volume_id, "/snaps/");
if (volid_ptr) {
ret = get_snap_volname_and_volinfo (volid_ptr, &volname,
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index c7100cab70b..e303937579e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1791,6 +1791,7 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
int port = 0;
int rdma_port = 0;
char *bind_address = NULL;
+ char *addr_family = NULL;
char socketpath[PATH_MAX] = {0};
char glusterd_uuid[1024] = {0,};
char valgrind_logfile[PATH_MAX] = {0};
@@ -1913,6 +1914,13 @@ retry:
bind_address);
}
+ if (dict_get_str (this->options, "transport.address-family",
+ &addr_family) == 0) {
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf (&runner, "*.transport.address-family=%s",
+ addr_family);
+ }
+
if (volinfo->transport_type == GF_TRANSPORT_RDMA)
runner_argprintf (&runner, "--volfile-server-transport=rdma");
else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA)
@@ -10791,6 +10799,45 @@ out:
}
void
+glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len)
+{
+ char workdir[PATH_MAX] = {0, };
+ glusterd_conf_t *priv = THIS->private;
+
+ GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv);
+
+ switch (volinfo->transport_type) {
+ case GF_TRANSPORT_TCP:
+ snprintf (path, path_len,
+ "%s/trusted-%s.tcp-gfproxy-fuse.vol",
+ workdir, volinfo->volname);
+ break;
+
+ case GF_TRANSPORT_RDMA:
+ snprintf (path, path_len,
+ "%s/trusted-%s.rdma-gfproxy-fuse.vol",
+ workdir, volinfo->volname);
+ break;
+ default:
+ break;
+ }
+}
+
+void
+glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len)
+{
+ char workdir[PATH_MAX] = {0, };
+ glusterd_conf_t *priv = THIS->private;
+
+ GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv);
+
+ snprintf (path, path_len, "%s/%s.gfproxyd.vol", workdir,
+ volinfo->volname);
+}
+
+void
glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,
char *path, int path_len)
{
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index f4c4138829f..7445407c010 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -642,6 +642,14 @@ void
glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,
char *path, int path_len);
+void
+glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len);
+
+void
+glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo,
+ char *path, int path_len);
+
int32_t
glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo,
glusterd_brickinfo_t *dup_brickinfo);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 25fb23f72b2..1f087b43ab4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -58,6 +58,20 @@ extern struct volopt_map_entry glusterd_volopt_map[];
} \
} while (0 /* CONSTCOND */)
+/**
+ * Needed for GFProxy
+ */
+#define GF_PROXY_DAEMON_PORT 40000
+#define GF_PROXY_DAEMON_PORT_STR "40000"
+
+static int
+volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+ dict_t *set_dict, void *param);
+
+static int
+build_client_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+ dict_t *mod_dict);
+
/*********************************************
*
* xlator generation / graph manipulation API
@@ -1448,6 +1462,75 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
static int
+gfproxy_server_graph_builder (volgen_graph_t *graph,
+ glusterd_volinfo_t *volinfo,
+ dict_t *set_dict, void *param)
+{
+ xlator_t *xl = NULL;
+ char *value = NULL;
+ char transt[16] = {0, };
+ char key[1024] = {0, };
+ char port_str[7] = {0, };
+ int ret = 0;
+ char *username = NULL;
+ char *password = NULL;
+ int rclusters = 0;
+
+ /* We are a trusted client */
+ ret = dict_set_uint32 (set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+ if (ret != 0)
+ goto out;
+
+ ret = dict_set_str (set_dict, "gfproxy-server", "on");
+ if (ret != 0)
+ goto out;
+
+ /* Build the client section of the graph first */
+ build_client_graph (graph, volinfo, set_dict);
+
+ /* Clear this setting so that future users of set_dict do not end up
+ * thinking they are a gfproxy server */
+ dict_del (set_dict, "gfproxy-server");
+ dict_del (set_dict, "trusted-client");
+
+ /* Then add the server to it */
+ get_vol_transport_type (volinfo, transt);
+ xl = volgen_graph_add (graph, "protocol/server", volinfo->volname);
+ if (!xl)
+ goto out;
+
+ ret = xlator_set_option (xl, "listen-port", GF_PROXY_DAEMON_PORT_STR);
+ if (ret != 0)
+ goto out;
+
+ ret = xlator_set_option (xl, "transport-type", transt);
+ if (ret != 0)
+ goto out;
+
+ /* Set username and password */
+ username = glusterd_auth_get_username (volinfo);
+ password = glusterd_auth_get_password (volinfo);
+ if (username) {
+ snprintf (key, sizeof (key), "auth.login.%s-server.allow",
+ volinfo->volname);
+ ret = xlator_set_option (xl, key, username);
+ if (ret)
+ return -1;
+ }
+
+ if (password) {
+ snprintf (key, sizeof (key), "auth.login.%s.password",
+ username);
+ ret = xlator_set_option (xl, key, password);
+ if (ret != 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static int
brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
{
@@ -2541,6 +2624,48 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
}
static int
+gfproxy_server_perfxl_option_handler (volgen_graph_t *graph,
+ struct volopt_map_entry *vme,
+ void *param)
+{
+ gf_boolean_t enabled = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (param);
+ volinfo = param;
+
+ /* write-behind is *not* allowed for gfproxy-servers */
+ if (strstr (vme->key, "write-behind")) {
+ return 0;
+ }
+
+ perfxl_option_handler (graph, vme, param);
+
+ return 0;
+}
+
+static int
+gfproxy_client_perfxl_option_handler (volgen_graph_t *graph,
+ struct volopt_map_entry *vme,
+ void *param)
+{
+ gf_boolean_t enabled = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (param);
+ volinfo = param;
+
+ /* write-behind is the only allowed "perf" for gfproxy-clients */
+ if (!strstr (vme->key, "write-behind"))
+ return 0;
+
+ perfxl_option_handler (graph, vme, param);
+
+ return 0;
+}
+
+
+static int
nfsperfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
void *param)
{
@@ -2768,8 +2893,10 @@ _free_xlator_opt_key (char *key)
}
static xlator_t *
-volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
- char *hostname, char *subvol, char *xl_id,
+volgen_graph_build_client (volgen_graph_t *graph,
+ glusterd_volinfo_t *volinfo,
+ char *hostname, char *port,
+ char *subvol, char *xl_id,
char *transt, dict_t *set_dict)
{
xlator_t *xl = NULL;
@@ -2801,6 +2928,12 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
goto err;
}
+ if (port) {
+ ret = xlator_set_option (xl, "remote-port", port);
+ if (ret)
+ goto err;
+ }
+
ret = xlator_set_option (xl, "remote-subvolume", subvol);
if (ret)
goto err;
@@ -2824,7 +2957,8 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
ret = dict_get_uint32 (set_dict, "trusted-client",
&client_type);
- if (!ret && client_type == GF_CLIENT_TRUSTED) {
+ if (!ret && (client_type == GF_CLIENT_TRUSTED
+ || client_type == GF_CLIENT_TRUSTED_PROXY)) {
str = NULL;
str = glusterd_auth_get_username (volinfo);
if (str) {
@@ -2911,7 +3045,9 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
i = 0;
cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) {
xl = volgen_graph_build_client (graph, volinfo,
- brick->hostname, brick->path,
+ brick->hostname,
+ NULL,
+ brick->path,
brick->brick_id,
transt, set_dict);
if (!xl) {
@@ -3143,8 +3279,9 @@ volgen_graph_build_snapview_client (volgen_graph_t *graph,
get_transport_type (volinfo, set_dict, transt, _gf_false);
- prot_clnt = volgen_graph_build_client (graph, volinfo, NULL, subvol,
- xl_id, transt, set_dict);
+ prot_clnt = volgen_graph_build_client (graph, volinfo,
+ NULL, NULL, subvol,
+ xl_id, transt, set_dict);
if (!prot_clnt) {
ret = -1;
goto out;
@@ -3555,6 +3692,27 @@ static int client_graph_set_perf_options(volgen_graph_t *graph,
{
data_t *tmp_data = NULL;
char *volname = NULL;
+ int ret = 0;
+
+ /*
+ * Logic to make sure gfproxy-client gets custom performance translators
+ */
+ ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0);
+ if (ret == 1) {
+ return volgen_graph_set_options_generic (
+ graph, set_dict, volinfo,
+ &gfproxy_client_perfxl_option_handler);
+ }
+
+ /*
+ * Logic to make sure gfproxy-server gets custom performance translators
+ */
+ ret = dict_get_str_boolean (set_dict, "gfproxy-server", 0);
+ if (ret == 1) {
+ return volgen_graph_set_options_generic (
+ graph, set_dict, volinfo,
+ &gfproxy_server_perfxl_option_handler);
+ }
/*
* Logic to make sure NFS doesn't have performance translators by
@@ -3768,29 +3926,55 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
char *volname = NULL;
glusterd_conf_t *conf = THIS->private;
char *tmp = NULL;
+ char *hostname = NULL;
gf_boolean_t var = _gf_false;
gf_boolean_t ob = _gf_false;
+ gf_boolean_t is_gfproxy = _gf_false;
int uss_enabled = -1;
xlator_t *this = THIS;
+ char *subvol = NULL;
+ size_t subvol_namelen = 0;
GF_ASSERT (this);
GF_ASSERT (conf);
- volname = volinfo->volname;
- ret = volgen_graph_build_clients (graph, volinfo, set_dict,
- param);
- if (ret)
+ ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0);
+ if (ret == -1)
goto out;
- if (volinfo->type == GF_CLUSTER_TYPE_TIER)
- ret = volume_volgen_graph_build_clusters_tier
- (graph, volinfo, _gf_false);
- else
- ret = volume_volgen_graph_build_clusters
- (graph, volinfo, _gf_false);
+ volname = volinfo->volname;
+ if (ret == 0) {
+ ret = volgen_graph_build_clients (graph, volinfo, set_dict,
+ param);
+ if (ret)
+ goto out;
- if (ret == -1)
- goto out;
+ if (volinfo->type == GF_CLUSTER_TYPE_TIER)
+ ret = volume_volgen_graph_build_clusters_tier
+ (graph, volinfo, _gf_false);
+ else
+ ret = volume_volgen_graph_build_clusters
+ (graph, volinfo, _gf_false);
+
+ if (ret == -1)
+ goto out;
+ } else {
+ is_gfproxy = _gf_true;
+ ret = dict_get_str (set_dict,
+ "config.gfproxyd-remote-host", &tmp);
+ if (ret == -1)
+ goto out;
+
+ subvol_namelen = strlen (volinfo->volname) +
+ strlen ("-server") + 1;
+ subvol = alloca (subvol_namelen);
+ snprintf (subvol, subvol_namelen,
+ "%s-server", volinfo->volname);
+
+ volgen_graph_build_client (graph, volinfo, tmp,
+ GF_PROXY_DAEMON_PORT_STR, subvol,
+ "gfproxy", "tcp", set_dict);
+ }
ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false);
if (ret == -1)
@@ -3851,6 +4035,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
+ /* gfproxy needs the AHA translator */
+ if (is_gfproxy) {
+ xl = volgen_graph_add (graph, "cluster/aha", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+
if (conf->op_version == GD_OP_VERSION_MIN) {
ret = glusterd_volinfo_get_boolean (volinfo,
VKEY_FEATURES_QUOTA);
@@ -4300,6 +4493,19 @@ nfs_option_handler (volgen_graph_t *graph,
return -1;
}
+ if (! strcmp (vme->option, "!nfs.*.exports-auth-enable")) {
+ ret = gf_asprintf (&aa, "nfs.%s.exports-auth-enable",
+ volinfo->volname);
+
+ if (ret != -1) {
+ ret = xlator_set_option (xl, aa, vme->value);
+ GF_FREE (aa);
+ }
+
+ if (ret)
+ return -1;
+ }
+
if ((strcmp (vme->voltype, "nfs/server") == 0) &&
(vme->option && vme->option[0]!='!') ) {
ret = xlator_set_option (xl, vme->option, vme->value);
@@ -4348,8 +4554,12 @@ volgen_get_shd_key (int type)
static gf_boolean_t
volgen_is_shd_compatible_xl (char *xl_type)
{
- char *shd_xls[] = {"cluster/replicate", "cluster/disperse",
- NULL};
+ char *shd_xls[] = {
+ "cluster/replicate",
+ "cluster/disperse",
+ "debug/io-stats",
+ NULL
+ };
if (gf_get_index_by_elem (shd_xls, xl_type) != -1)
return _gf_true;
@@ -4731,6 +4941,24 @@ out:
return ret;
}
+static int
+volgen_graph_set_iam_nfsd (const volgen_graph_t *graph)
+{
+ xlator_t *trav;
+ int ret = 0;
+
+ for (trav = first_of ((volgen_graph_t *)graph); trav;
+ trav = trav->next) {
+ if (strcmp (trav->type, "cluster/replicate") != 0)
+ continue;
+
+ ret = xlator_set_option (trav, "iam-nfs-daemon", "yes");
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
/* builds a graph for nfs server role, with option overrides in mod_dict */
int
build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
@@ -4869,6 +5097,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
if (ret)
goto out;
+ ret = volgen_graph_set_iam_nfsd (&cgraph);
+ if (ret)
+ goto out;
+
ret = volgen_graph_merge_sub (graph, &cgraph, 1);
if (ret)
goto out;
@@ -4930,6 +5162,22 @@ get_brick_filepath (char *filename, glusterd_volinfo_t *volinfo,
brickinfo->hostname, brick);
}
+static void
+get_gfproxyd_filepath (char *filename, glusterd_volinfo_t *volinfo)
+{
+ char path[PATH_MAX] = {0, };
+ char brick[PATH_MAX] = {0, };
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+
+ GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
+
+ snprintf (filename, PATH_MAX,
+ "%s/%s.gfproxyd.vol", path,
+ volinfo->volname);
+}
+
gf_boolean_t
glusterd_is_valid_volfpath (char *volname, char *brick)
{
@@ -4975,6 +5223,32 @@ out:
}
static int
+glusterd_generate_gfproxyd_volfile (glusterd_volinfo_t *volinfo)
+{
+ volgen_graph_t graph = {0, };
+ char filename[PATH_MAX] = {0, };
+ int ret = -1;
+
+ GF_ASSERT (volinfo);
+
+ get_gfproxyd_filepath (filename, volinfo);
+
+ struct glusterd_gfproxyd_info info = {
+ .port = GF_PROXY_DAEMON_PORT,
+ };
+
+ ret = build_graph_generic (&graph, volinfo,
+ NULL, &info,
+ &gfproxy_server_graph_builder);
+ if (ret == 0)
+ ret = volgen_write_volfile (&graph, filename);
+
+ volgen_graph_free (&graph);
+
+ return ret;
+}
+
+static int
glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
dict_t *mod_dict, void *data)
@@ -5245,7 +5519,8 @@ glusterd_generate_client_per_brick_volfile (glusterd_volinfo_t *volinfo)
cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) {
xl = volgen_graph_build_client (&graph, volinfo,
- brick->hostname, brick->path,
+ brick->hostname,
+ NULL, brick->path,
brick->brick_id,
"tcp", dict);
if (!xl) {
@@ -5376,6 +5651,11 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo,
ret = glusterd_get_trusted_client_filepath (filepath,
volinfo,
type);
+ } else if (client_type == GF_CLIENT_TRUSTED_PROXY) {
+ glusterd_get_gfproxy_client_volfile (volinfo,
+ filepath,
+ PATH_MAX);
+ ret = dict_set_str (dict, "gfproxy-client", "on");
} else {
ret = glusterd_get_client_filepath (filepath,
volinfo,
@@ -5620,6 +5900,7 @@ build_bitd_volume_graph (volgen_graph_t *graph,
xl = volgen_graph_build_client (&cgraph, volinfo,
brickinfo->hostname,
+ NULL,
brickinfo->path,
brickinfo->brick_id,
transt, set_dict);
@@ -5782,6 +6063,7 @@ build_scrub_volume_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
xl = volgen_graph_build_client (&cgraph, volinfo,
brickinfo->hostname,
+ NULL,
brickinfo->path,
brickinfo->brick_id,
transt, set_dict);
@@ -5913,12 +6195,25 @@ glusterd_create_volfiles (glusterd_volinfo_t *volinfo)
goto out;
}
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED_PROXY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not generate gfproxy client volfiles");
+ goto out;
+ }
+
ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER);
if (ret)
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_VOLFILE_CREATE_FAIL,
"Could not generate client volfiles");
+
+ ret = glusterd_generate_gfproxyd_volfile (volinfo);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not generate gfproxy volfiles");
+
out:
return ret;
}
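The gfproxy perfxl handlers above partition the performance translators purely by option key: write-behind stays on the gfproxy client graph and is excluded from the gfproxy server graph, while every other perf xlator goes to the server. A standalone sketch of that key-based split (the option names below are illustrative, not an exhaustive list):

#include <stdio.h>
#include <string.h>

/* gfproxy split: the client graph keeps only write-behind, the server
 * graph keeps everything except write-behind. */
static int client_wants_option (const char *key)
{
        return strstr (key, "write-behind") != NULL;
}

static int server_wants_option (const char *key)
{
        return strstr (key, "write-behind") == NULL;
}

int main (void)
{
        const char *keys[] = {
                "performance.write-behind",
                "performance.io-cache",
                "performance.read-ahead",
        };
        size_t i;

        for (i = 0; i < sizeof (keys) / sizeof (keys[0]); i++)
                printf ("%-28s client=%d server=%d\n", keys[i],
                        client_wants_option (keys[i]),
                        server_wants_option (keys[i]));
        return 0;
}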
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index f90177372dc..cb2cad50efc 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -52,7 +52,8 @@
typedef enum {
GF_CLIENT_TRUSTED,
- GF_CLIENT_OTHER
+ GF_CLIENT_OTHER,
+ GF_CLIENT_TRUSTED_PROXY,
} glusterd_client_type_t;
struct volgen_graph {
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index 62de6b31b64..8f2a23a898a 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -286,6 +286,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)
int32_t type = 0;
char *username = NULL;
char *password = NULL;
+#ifdef IPV6_DEFAULT
+ char *addr_family = "inet6";
+#else
+ char *addr_family = "inet";
+#endif
GF_ASSERT (req);
@@ -388,10 +393,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)
/* Setting default as inet for trans_type tcp */
ret = dict_set_dynstr_with_alloc (dict,
"transport.address-family",
- "inet");
+ addr_family);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "failed to set transport.address-family");
+ "failed to set transport.address-family "
+ "to %s", addr_family);
goto out;
}
}
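The volume-create change simply replaces the hard-coded "inet" default with a compile-time choice: builds configured with IPV6_DEFAULT seed transport.address-family with "inet6" instead. A tiny sketch of the same #ifdef selection:

#include <stdio.h>

/* Compile with -DIPV6_DEFAULT to get "inet6", otherwise "inet". */
static const char *default_address_family (void)
{
#ifdef IPV6_DEFAULT
        return "inet6";
#else
        return "inet";
#endif
}

int main (void)
{
        printf ("transport.address-family defaults to %s\n",
                default_address_family ());
        return 0;
}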
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 1e24adabe0c..d29f32d1963 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -11,6 +11,7 @@
#include "glusterd-volgen.h"
#include "glusterd-utils.h"
+#if USE_GFDB
static int
get_tier_freq_threshold (glusterd_volinfo_t *volinfo, char *threshold_key) {
int threshold = 0;
@@ -473,6 +474,7 @@ out:
return ret;
}
+#endif
static int
validate_cache_max_min_size (glusterd_volinfo_t *volinfo, dict_t *dict,
@@ -1048,6 +1050,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 1,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.min-free-strict-mode",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "cluster.min-free-inodes",
.voltype = "cluster/distribute",
.op_version = 1,
@@ -1113,6 +1120,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.flags = OPT_FLAG_CLIENT_OPT,
},
+ { .key = "cluster.du-refresh-interval-sec",
+ .voltype = "cluster/distribute",
+ .option = "du-refresh-interval-sec",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
/* NUFA xlator options (Distribute special case) */
{ .key = "cluster.nufa",
.voltype = "cluster/distribute",
@@ -1299,6 +1313,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_7_12,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.pgfid-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .description = "Use PGFID attribute if available to remediate "
+ "failed heals."
+ },
/* stripe xlator options */
{ .key = "cluster.stripe-block-size",
@@ -1454,6 +1475,18 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 1,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "performance.statfs-cache",
+ .voltype = "performance/io-cache",
+ .option = "statfs-cache",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.statfs-cache-timeout",
+ .voltype = "performance/io-cache",
+ .option = "statfs-cache-timeout",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
/* IO-threads xlator options */
{ .key = "performance.io-thread-count",
@@ -1461,6 +1494,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "thread-count",
.op_version = 1
},
+ { .key = "performance.io-thread-fops-per-thread-ratio",
+ .voltype = "performance/io-threads",
+ .option = "fops-per-thread-ratio",
+ .op_version = 1
+ },
{ .key = "performance.high-prio-threads",
.voltype = "performance/io-threads",
.op_version = 1
@@ -1555,6 +1593,18 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 2,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "performance.write-behind-trickling-writes",
+ .voltype = "performance/write-behind",
+ .option = "trickling-writes",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.nfs.write-behind-trickling-writes",
+ .voltype = "performance/write-behind",
+ .option = "trickling-writes",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "performance.lazy-open",
.voltype = "performance/open-behind",
.option = "lazy-open",
@@ -2403,8 +2453,8 @@ struct volopt_map_entry glusterd_volopt_map[] = {
/* Cli options for Export authentication on nfs mount */
{ .key = "nfs.exports-auth-enable",
.voltype = "nfs/server",
- .option = "nfs.exports-auth-enable",
- .type = GLOBAL_DOC,
+ .option = "!nfs.*.exports-auth-enable",
+ //.type = GLOBAL_DOC,
.op_version = GD_OP_VERSION_3_7_0
},
{ .key = "nfs.auth-refresh-interval-sec",
@@ -2500,6 +2550,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.voltype = "storage/posix",
.op_version = GD_OP_VERSION_3_6_0,
},
+ { .key = "storage.min-free-disk",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
+ { .key = "storage.freespace-check-interval",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
{ .key = "storage.bd-aio",
.voltype = "storage/bd",
.op_version = 3
@@ -2515,6 +2573,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "!config",
.op_version = 2
},
+ { .key = "config.gfproxyd-remote-host",
+ .voltype = "configuration",
+ .option = "gfproxyd-remote-host",
+ .op_version = 2
+ },
{ .key = GLUSTERD_QUORUM_TYPE_KEY,
.voltype = "mgmt/glusterd",
.value = "off",
@@ -2961,7 +3024,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {
{ .key = "cluster.locking-scheme",
.voltype = "cluster/replicate",
.type = DOC,
- .op_version = GD_OP_VERSION_3_7_12,
+ .op_version = GD_OP_VERSION_3_7_12 ,
.flags = OPT_FLAG_CLIENT_OPT
},
{ .key = "cluster.granular-entry-heal",
@@ -2970,6 +3033,72 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_8_0,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .option = "revocation-secs",
+ .key = "features.locks-revocation-secs",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "revocation-clear-all",
+ .key = "features.locks-revocation-clear-all",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "revocation-max-blocked",
+ .key = "features.locks-revocation-max-blocked",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ },
+ { .option = "monkey-unlocking",
+ .key = "features.locks-monkey-unlocking",
+ .voltype = "features/locks",
+ .op_version = GD_OP_VERSION_3_6_0,
+ .type = NO_DOC,
+ },
+ { .key = "cluster.halo-enabled",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-hybrid-mode",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-failover-enabled",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-shd-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-nfsd-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-max-latency",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-max-replicas",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-min-replicas",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.halo-min-samples",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index 5bdf2ad0d4b..7c59d5501a9 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -222,6 +222,11 @@ struct glusterd_brickinfo {
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
+struct glusterd_gfproxyd_info {
+ short port;
+ char *logfile;
+};
+
struct gf_defrag_brickinfo_ {
char *name;
int files;
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 6c4cdfed062..598f62fee7a 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -186,6 +186,25 @@ start_glusterfs ()
fi
#options with values start here
+ if [ -n "$halo_failover_enabled" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-failover-enabled=$halo_failover_enabled");
+ fi
+ if [ -n "$halo_max_latency" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-latency=$halo_max_latency");
+ fi
+
+ if [ -n "$halo_max_replicas" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-replicas=$halo_max_replicas");
+ fi
+
+ if [ -n "$halo_min_replicas" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-min-replicas=$halo_min_replicas");
+ fi
+
if [ -n "$log_level" ]; then
cmd_line=$(echo "$cmd_line --log-level=$log_level");
fi
@@ -479,6 +498,18 @@ with_options()
[ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
fuse_mountopts="${fuse_mountopts}$key=\"$value\""
;;
+ "halo-max-latency")
+ halo_max_latency=$value
+ ;;
+ "halo-max-replicas")
+ halo_max_replicas=$value
+ ;;
+ "halo-min-replicas")
+ halo_min_replicas=$value
+ ;;
+ "halo-failover-enabled")
+ halo_failover_enabled=$value
+ ;;
x-*)
# comments or userspace application-specific options, drop them
;;
diff --git a/xlators/nfs/server/src/auth-cache.c b/xlators/nfs/server/src/auth-cache.c
index 730e0a97d20..a607502c9de 100644
--- a/xlators/nfs/server/src/auth-cache.c
+++ b/xlators/nfs/server/src/auth-cache.c
@@ -17,47 +17,28 @@
#include "exports.h"
#include "nfs-messages.h"
-enum auth_cache_lookup_results {
- ENTRY_FOUND = 0,
- ENTRY_NOT_FOUND = -1,
- ENTRY_EXPIRED = -2,
-};
-
-struct auth_cache_entry {
- GF_REF_DECL; /* refcounting */
- data_t *data; /* data_unref() on refcount == 0 */
-
- time_t timestamp;
- struct export_item *item;
-};
-
/* Given a filehandle and an ip, creates a colon delimited hashkey.
*/
-static char*
-make_hashkey(struct nfs3_fh *fh, const char *host)
-{
- char *hashkey = NULL;
- char exportid[256] = {0, };
- char gfid[256] = {0, };
- char mountid[256] = {0, };
- size_t nbytes = 0;
-
- gf_uuid_unparse (fh->exportid, exportid);
- gf_uuid_unparse (fh->gfid, gfid);
- gf_uuid_unparse (fh->mountid, mountid);
-
- nbytes = strlen (exportid) + strlen (host)
- + strlen (mountid) + 3;
- hashkey = GF_MALLOC (nbytes, gf_common_mt_char);
- if (!hashkey)
- return NULL;
-
- snprintf (hashkey, nbytes, "%s:%s:%s", exportid,
- mountid, host);
-
- return hashkey;
-}
-
+#define make_fh_hashkey(hashkey, fh, host) \
+ do { \
+ char exportid[256] = {0, }; \
+ char mountid[256] = {0, }; \
+ size_t nbytes = 0; \
+ gf_uuid_unparse (fh->exportid, exportid); \
+ gf_uuid_unparse (fh->mountid, mountid); \
+ nbytes = strlen (exportid) + strlen (host) \
+ + strlen (mountid) + 5; \
+ hashkey = alloca (nbytes); \
+ snprintf (hashkey, nbytes, "%s:%s:%s", exportid, \
+ mountid, host); \
+ } while (0); \
+
+#define make_path_hashkey(hashkey, path, host) \
+ do { \
+ size_t nbytes = strlen (path) + strlen (host) + 2; \
+ hashkey = alloca (nbytes); \
+ snprintf (hashkey, nbytes, "%s:%s", path, host); \
+ } while (0);
/**
* auth_cache_init -- Initialize an auth cache and set the ttl_sec
*
@@ -86,28 +67,11 @@ out:
return cache;
}
-/* auth_cache_entry_free -- called by refcounting subsystem on refcount == 0
- *
- * @to_free: auth_cache_entry that has refcount == 0 and needs to get free'd
- */
-void
-auth_cache_entry_free (void *to_free)
-{
- struct auth_cache_entry *entry = to_free;
- data_t *entry_data = NULL;
-
- GF_VALIDATE_OR_GOTO (GF_NFS, entry, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, entry->data, out);
-
- entry_data = entry->data;
- /* set data_t->data to NULL, otherwise data_unref() tries to free it */
- entry_data->data = NULL;
- data_unref (entry_data);
-
- GF_FREE (entry);
-out:
- return;
-}
+struct auth_cache_entry {
+ time_t timestamp;
+ struct export_item *item;
+ gf_boolean_t access_allowed;
+};
/**
* auth_cache_entry_init -- Initialize an auth cache entry
@@ -124,303 +88,203 @@ auth_cache_entry_init ()
if (!entry)
gf_msg (GF_NFS, GF_LOG_WARNING, ENOMEM, NFS_MSG_NO_MEMORY,
"failed to allocate entry");
- else
- GF_REF_INIT (entry, auth_cache_entry_free);
return entry;
}
+// Internal lookup
+enum _internal_cache_lookup_results {
+ ENTRY_NOT_FOUND = -1,
+ ENTRY_EXPIRED = -2,
+};
+
/**
- * auth_cache_add -- Add an auth_cache_entry to the cache->dict
+ * auth_cache_purge -- Purge the dict in the cache and set
+ * the dict pointer to NULL. It will be allocated
+ * on the first insert into the dict.
+ *
+ * @cache: Cache to purge
*
- * @return: 0 on success, non-zero otherwise.
*/
-static int
-auth_cache_add (struct auth_cache *cache, char *hashkey,
- struct auth_cache_entry *entry)
+void
+auth_cache_purge (struct auth_cache *cache)
{
- int ret = -1;
- data_t *entry_data = NULL;
-
- GF_VALIDATE_OR_GOTO (GF_NFS, cache, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, cache->cache_dict, out);
-
- ret = GF_REF_GET (entry);
- if (ret == 0) {
- /* entry does not have any references */
- ret = -1;
- goto out;
- }
+ dict_t *new_cache_dict = NULL;
+ dict_t *old_cache_dict = cache->cache_dict;
- entry_data = bin_to_data (entry, sizeof (*entry));
- if (!entry_data) {
- ret = -1;
- GF_REF_PUT (entry);
+ if (!cache || !cache->cache_dict)
goto out;
- }
- /* we'll take an extra ref on the data_t, it gets unref'd when the
- * auth_cache_entry is released */
- entry->data = data_ref (entry_data);
+ (void)__sync_lock_test_and_set (&cache->cache_dict, new_cache_dict);
- LOCK (&cache->lock);
- {
- ret = dict_set (cache->cache_dict, hashkey, entry_data);
- }
- UNLOCK (&cache->lock);
-
- if (ret) {
- /* adding to dict failed */
- GF_REF_PUT (entry);
- }
+ dict_destroy (old_cache_dict);
out:
- return ret;
+ return;
}
-/**
- * _auth_cache_expired -- Check if the auth_cache_entry has expired
- *
- * The auth_cache->lock should have been taken when this function is called.
- *
- * @return: true when the auth_cache_entry is expired, false otherwise.
- */
-static int
-_auth_cache_expired (struct auth_cache *cache, struct auth_cache_entry *entry)
-{
- return ((time (NULL) - entry->timestamp) > cache->ttl_sec);
-}
/**
- * auth_cache_get -- Get the @hashkey entry from the cache->cache_dict
- *
- * @cache: The auth_cache that should contain the @entry.
- * @haskkey: The key associated with the auth_cache_entry.
- * @entry: The found auth_cache_entry, unmodified if not found/expired.
- *
- * The using the cache->dict requires locking, this function takes care of
- * that. When the entry is found, but has expired, it will be removed from the
- * cache_dict.
- *
- * @return: 0 when found, ENTRY_NOT_FOUND or ENTRY_EXPIRED otherwise.
+ * Lookup filehandle or path from the cache.
*/
-static enum auth_cache_lookup_results
-auth_cache_get (struct auth_cache *cache, char *hashkey,
- struct auth_cache_entry **entry)
+int _cache_lookup (struct auth_cache *cache, char *key,
+ struct auth_cache_entry **entry)
{
- enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND;
- data_t *entry_data = NULL;
- struct auth_cache_entry *lookup_res = NULL;
+ int ret = ENTRY_NOT_FOUND;
+ struct auth_cache_entry *lookup_res;
+ data_t *entry_data;
- GF_VALIDATE_OR_GOTO (GF_NFS, cache, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, cache->cache_dict, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, hashkey, out);
-
- LOCK (&cache->lock);
- {
- entry_data = dict_get (cache->cache_dict, hashkey);
- if (!entry_data)
- goto unlock;
-
- lookup_res = (struct auth_cache_entry *)(entry_data->data);
- if (GF_REF_GET (lookup_res) == 0) {
- /* entry has been free'd */
- ret = ENTRY_EXPIRED;
- goto unlock;
- }
+ if (!cache->cache_dict) {
+ goto out;
+ }
- if (_auth_cache_expired (cache, lookup_res)) {
- ret = ENTRY_EXPIRED;
+ if (!entry) {
+ goto out;
+ }
- /* free entry and remove from the cache */
- GF_FREE (lookup_res);
- entry_data->data = NULL;
- dict_del (cache->cache_dict, hashkey);
+ *entry = NULL;
- goto unlock;
- }
+ entry_data = dict_get (cache->cache_dict, key);
+ if (!entry_data) {
+ goto out;
+ }
- *entry = lookup_res;
- ret = ENTRY_FOUND;
+ lookup_res = (struct auth_cache_entry *)(entry_data->data);
+ if (time (NULL) - lookup_res->timestamp > cache->ttl_sec) {
+ GF_FREE (lookup_res);
+ entry_data->data = NULL;
+ dict_del (cache->cache_dict, key); // Remove from the cache
+ ret = ENTRY_EXPIRED;
+ goto out;
}
-unlock:
- UNLOCK (&cache->lock);
+
+ *entry = lookup_res;
+
+ return 0;
out:
- return ret;
+ return -1;
}
/**
- * auth_cache_lookup -- Lookup an item from the cache
- *
- * @cache: cache to lookup from
- * @fh : FH to use in lookup
- * @host_addr: Address to use in lookup
- * @timestamp: The timestamp to set when lookup succeeds
- * @can_write: Is the host authorized to write to the filehandle?
- *
- * If the current time - entry time of the cache entry > ttl_sec,
- * we remove the element from the dict and return ENTRY_EXPIRED.
- *
- * @return: ENTRY_EXPIRED if entry expired
- * ENTRY_NOT_FOUND if entry not found in dict
- * 0 if found
+ * Lookup filehandle from the cache.
*/
-enum auth_cache_lookup_results
-auth_cache_lookup (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr, time_t *timestamp,
- gf_boolean_t *can_write)
+int
+_cache_lookup_fh (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *host_addr, struct auth_cache_entry **ec)
{
- char *hashkey = NULL;
- struct auth_cache_entry *lookup_res = NULL;
- enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND;
-
- GF_VALIDATE_OR_GOTO (GF_NFS, cache, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, fh, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, timestamp, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, can_write, out);
-
- hashkey = make_hashkey (fh, host_addr);
- if (!hashkey) {
- ret = -ENOMEM;
- goto out;
+ char *hashkey;
+ int ret = ENTRY_NOT_FOUND;
+ if (fh && host_addr) {
+ make_fh_hashkey (hashkey, fh, host_addr);
+ ret =_cache_lookup (cache, hashkey, ec);
}
-
- ret = auth_cache_get (cache, hashkey, &lookup_res);
- switch (ret) {
- case ENTRY_FOUND:
- *timestamp = lookup_res->timestamp;
- *can_write = lookup_res->item->opts->rw;
- GF_REF_PUT (lookup_res);
- break;
-
- case ENTRY_NOT_FOUND:
- gf_msg_debug (GF_NFS, 0, "could not find entry for %s",
- host_addr);
- break;
-
- case ENTRY_EXPIRED:
- gf_msg_debug (GF_NFS, 0, "entry for host %s has expired",
- host_addr);
- break;
- }
-
-out:
- GF_FREE (hashkey);
-
return ret;
}
-/* auth_cache_entry_purge -- free up the auth_cache_entry
- *
- * This gets called through dict_foreach() by auth_cache_purge(). Each
- * auth_cache_entry has a refcount which needs to be decremented. Once the
- * auth_cache_entry reaches refcount == 0, auth_cache_entry_free() will call
- * data_unref() to free the associated data_t.
- *
- * @d: dict that gets purged by auth_cache_purge()
- * @k: hashkey of the current entry
- * @v: data_t of the current entry
+/**
+ * Lookup path from the cache.
*/
int
-auth_cache_entry_purge (dict_t *d, char *k, data_t *v, void *_unused)
+_cache_lookup_path (struct auth_cache *cache, const char *path,
+ const char *host_addr, struct auth_cache_entry **ec)
{
- struct auth_cache_entry *entry = (struct auth_cache_entry *) v->data;
-
- if (entry)
- GF_REF_PUT (entry);
-
- return 0;
+ char *hashkey;
+ int ret = ENTRY_NOT_FOUND;
+ if (path && host_addr) {
+ make_path_hashkey (hashkey, path, host_addr);
+ ret = _cache_lookup (cache, hashkey, ec);
+ }
+ return ret;
}
/**
- * auth_cache_purge -- Purge the dict in the cache and create a new empty one.
- *
- * @cache: Cache to purge
- *
+ * cache_item -- Caches either a filehandle or path.
+ * See descriptions of functions that invoke this one.
*/
-void
-auth_cache_purge (struct auth_cache *cache)
+int
+cache_item (struct auth_cache *cache, const char *path, struct nfs3_fh *fh,
+ const char *host_addr, struct export_item *export_item,
+ auth_cache_status_t status)
{
- dict_t *new_cache_dict = dict_new ();
- dict_t *old_cache_dict = NULL;
+ int ret = -EINVAL;
+ data_t *entry_data = NULL;
+ struct auth_cache_entry *entry = NULL;
+ char *hashkey = NULL;
- if (!cache || !new_cache_dict)
+ GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out);
+ GF_VALIDATE_OR_GOTO (GF_NFS, cache, out);
+
+ // We can cache either a file-handle or a path, not both,
+ // and at least one of them must be defined!
+ if ((fh && path) || (!fh && !path)) {
goto out;
+ }
- LOCK (&cache->lock);
- {
- old_cache_dict = cache->cache_dict;
- cache->cache_dict = new_cache_dict;
+ // If a dict has not been allocated already, allocate it.
+ if (!cache->cache_dict) {
+ cache->cache_dict = dict_new ();
+ if (!cache->cache_dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- UNLOCK (&cache->lock);
- /* walk all entries and refcount-- with GF_REF_PUT() */
- dict_foreach (old_cache_dict, auth_cache_entry_purge, NULL);
- dict_unref (old_cache_dict);
-out:
- return;
-}
-/**
- * is_nfs_fh_cached_and_writeable -- Checks if an NFS FH is cached for the given
- * host
- * @cache: The fh cache
- * @host_addr: Address to use in lookup
- * @fh: The fh to use in lookup
- *
- *
- * @return: TRUE if cached, FALSE otherwise
- *
- */
-gf_boolean_t
-is_nfs_fh_cached (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr)
-{
- int ret = 0;
- time_t timestamp = 0;
- gf_boolean_t cached = _gf_false;
- gf_boolean_t can_write = _gf_false;
+ // Find an entry with the filehandle or path, depending
+ // on which one is defined. Validation for these parameters
+ // is above.
+ if (fh) {
+ ret = _cache_lookup_fh (cache, fh, host_addr, &entry);
+ make_fh_hashkey (hashkey, fh, host_addr);
+ }
- if (!fh)
- goto out;
+ if (path) {
+ ret = _cache_lookup_path (cache, path, host_addr, &entry);
+ make_path_hashkey (hashkey, path, host_addr);
+ }
+
+ // If no entry was found, we need to create one.
+ if (!entry) {
+ entry = auth_cache_entry_init ();
+ GF_CHECK_ALLOC (entry, ret, out);
+ }
- ret = auth_cache_lookup (cache, fh, host_addr, &timestamp, &can_write);
- cached = (ret == ENTRY_FOUND);
+ // Populate the entry
+ entry->timestamp = time (NULL);
+ entry->item = export_item;
+ // Access is only allowed if the status is set to
+ // AUTH_CACHE_HOST_AUTH_OK
+ entry->access_allowed = (status == AUTH_CACHE_HOST_AUTH_OK);
+ // Put the entry into the cache
+ entry_data = bin_to_data (entry, sizeof (*entry));
+ ret = dict_set (cache->cache_dict, hashkey, entry_data);
+ gf_log (GF_NFS, GF_LOG_TRACE, "Caching %s for host(%s) as %s",
+ path ? path : "fh", host_addr, entry->access_allowed ?
+ "ALLOWED" : "NOT ALLOWED");
out:
- return cached;
+ return ret;
}
-
/**
- * is_nfs_fh_cached_and_writeable -- Checks if an NFS FH is cached for the given
- * host and writable
- * @cache: The fh cache
- * @host_addr: Address to use in lookup
- * @fh: The fh to use in lookup
- *
+ * cache_nfs_path -- Places the path in the underlying dict that we are
+ * using as our cache. The value is an entry struct
+ * containing the export item that was authorized or
+ * deauthorized for the operation, along with the path
+ * that was authorized or deauthorized.
*
- * @return: TRUE if cached & writable, FALSE otherwise
+ * @cache: The cache to place fh's in
+ * @path : The path to cache
+ * @host_addr: The address of the host
+ * @export_item: The export item that was authorized/deauthorized
*
*/
-gf_boolean_t
-is_nfs_fh_cached_and_writeable (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr)
+int
+cache_nfs_path (struct auth_cache *cache, const char *path,
+ const char *host_addr, struct export_item *export_item,
+ auth_cache_status_t status)
{
- int ret = 0;
- time_t timestamp = 0;
- gf_boolean_t cached = _gf_false;
- gf_boolean_t writable = _gf_false;
-
- if (!fh)
- goto out;
-
- ret = auth_cache_lookup (cache, fh, host_addr, &timestamp, &writable);
- cached = ((ret == ENTRY_FOUND) && writable);
-
-out:
- return cached;
+ return cache_item (cache, path, NULL, host_addr, export_item, status);
}
/**
@@ -438,52 +302,68 @@ out:
*/
int
cache_nfs_fh (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr, struct export_item *export_item)
+ const char *host_addr, struct export_item *export_item,
+ auth_cache_status_t status)
{
- int ret = -EINVAL;
- char *hashkey = NULL;
- data_t *entry_data = NULL;
- time_t timestamp = 0;
- gf_boolean_t can_write = _gf_false;
- struct auth_cache_entry *entry = NULL;
+ return cache_item (cache, NULL, fh, host_addr, export_item, status);
+}
- GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, cache, out);
- GF_VALIDATE_OR_GOTO (GF_NFS, fh, out);
+auth_cache_status_t
+auth_cache_allows (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *path, const char *host_addr,
+ gf_boolean_t check_rw_access)
+{
+ int ret = 0;
+ int status = AUTH_CACHE_HOST_EACCES;
+ gf_boolean_t cache_allows = FALSE;
+ struct auth_cache_entry *ace = NULL;
- /* If we could already find it in the cache, just return */
- ret = auth_cache_lookup (cache, fh, host_addr, &timestamp, &can_write);
- if (ret == 0) {
- gf_msg_trace (GF_NFS, 0, "found cached auth/fh for host "
- "%s", host_addr);
+ if ((fh && path) || (!fh && !path)) {
+ status = AUTH_CACHE_HOST_ENOENT;
goto out;
}
- hashkey = make_hashkey (fh, host_addr);
- if (!hashkey) {
- ret = -ENOMEM;
- goto out;
+ if (fh) {
+ ret = _cache_lookup_fh (cache, fh, host_addr, &ace);
}
- entry = auth_cache_entry_init ();
- if (!entry) {
- ret = -ENOMEM;
- goto out;
+ if (path) {
+ ret = _cache_lookup_path (cache, path, host_addr, &ace);
}
- entry->timestamp = time (NULL);
- entry->item = export_item;
-
- ret = auth_cache_add (cache, hashkey, entry);
- GF_REF_PUT (entry);
- if (ret)
- goto out;
+ cache_allows = (ret == 0) && ace->access_allowed;
+ if (check_rw_access) {
+ cache_allows = cache_allows && ace->item->opts->rw;
+ }
- gf_msg_trace (GF_NFS, 0, "Caching file-handle (%s)", host_addr);
- ret = 0;
+ if (!ace) {
+ status = AUTH_CACHE_HOST_ENOENT;
+ }
+ if (cache_allows) {
+ status = AUTH_CACHE_HOST_AUTH_OK;
+ }
out:
- GF_FREE (hashkey);
+ return status;
+}
- return ret;
+auth_cache_status_t
+auth_cache_allows_fh (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *host_addr)
+{
+ return auth_cache_allows (cache, fh, NULL, host_addr, FALSE);
+}
+
+auth_cache_status_t
+auth_cache_allows_write_to_fh (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *host_addr)
+{
+ return auth_cache_allows (cache, fh, NULL, host_addr, TRUE);
+}
+
+auth_cache_status_t
+auth_cache_allows_path (struct auth_cache *cache, const char *path,
+ const char *host_addr)
+{
+ return auth_cache_allows (cache, NULL, path, host_addr, FALSE);
}
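
A minimal sketch of how a caller might consume the tri-state status API added above. The function and status names come from this change; the cache, fh and host variables are assumed to exist in the caller:

switch (auth_cache_allows_fh (cache, fh, host)) {
case AUTH_CACHE_HOST_AUTH_OK:
        /* cached and authorized: allow the operation */
        break;
case AUTH_CACHE_HOST_EACCES:
        /* cached as explicitly de-authorized: reject with EACCES */
        break;
case AUTH_CACHE_HOST_ENOENT:
default:
        /* not cached: fall back to the full exports/netgroups check */
        break;
}
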
diff --git a/xlators/nfs/server/src/auth-cache.h b/xlators/nfs/server/src/auth-cache.h
index a3ea5a43ded..de7db6b5545 100644
--- a/xlators/nfs/server/src/auth-cache.h
+++ b/xlators/nfs/server/src/auth-cache.h
@@ -27,6 +27,11 @@ struct auth_cache {
time_t ttl_sec; /* TTL of the auth cache in seconds */
};
+typedef enum {
+ AUTH_CACHE_HOST_ENOENT = -1, /* Host not found in cache */
+ AUTH_CACHE_HOST_EACCES = -2, /* Host explicitly de-authed */
+ AUTH_CACHE_HOST_AUTH_OK = 0, /* Host is fully authed */
+} auth_cache_status_t;
/* Initializes the cache */
struct auth_cache *
@@ -35,17 +40,29 @@ auth_cache_init (time_t ttl_sec);
/* Inserts FH into cache */
int
cache_nfs_fh (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr, struct export_item *export_item);
+ const char *host_addr, struct export_item *export_item,
+ auth_cache_status_t status);
+
+/* Inserts path into cache */
+int
+cache_nfs_path (struct auth_cache *cache, const char *path,
+ const char *host_addr, struct export_item *export_item,
+ auth_cache_status_t status);
/* Checks if the filehandle cached & writable */
-gf_boolean_t
-is_nfs_fh_cached_and_writeable (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr);
+auth_cache_status_t
+auth_cache_allows_write_to_fh (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *host_addr);
/* Checks if the filehandle is cached */
-gf_boolean_t
-is_nfs_fh_cached (struct auth_cache *cache, struct nfs3_fh *fh,
- const char *host_addr);
+auth_cache_status_t
+auth_cache_allows_fh (struct auth_cache *cache, struct nfs3_fh *fh,
+ const char *host_addr);
+
+/* Checks if the path is cached */
+auth_cache_status_t
+auth_cache_allows_path (struct auth_cache *cache, const char *path,
+ const char *host_addr);
/* Purge the cache */
void
diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h
index bc9af2f0b8b..a4e15d3f7ef 100644
--- a/xlators/nfs/server/src/exports.h
+++ b/xlators/nfs/server/src/exports.h
@@ -22,7 +22,7 @@
#define GF_EXP GF_NFS"-exports"
#define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())"
-#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)"
+#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)"
#define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)"
#define NETGROUP_MAX_LEN 128
@@ -51,23 +51,28 @@ struct export_options {
char *anon_uid; /* anonuid option */
char *sec_type; /* X, for sec=X */
};
+typedef struct export_options export_options_t;
+
struct export_item {
- char *name; /* Name of the export item */
- struct export_options *opts; /* NFS Options */
+ char *name; /* Name of the export item */
+ export_options_t *opts; /* NFS Options */
};
+typedef struct export_item export_item_t;
struct export_dir {
char *dir_name; /* Directory */
dict_t *netgroups; /* Dict of netgroups */
dict_t *hosts; /* Dict of hosts */
};
+typedef struct export_dir export_dir_t;
struct exports_file {
char *filename; /* Filename */
dict_t *exports_dict; /* Dict of export_dir_t */
dict_t *exports_map; /* Map of SuperFastHash(<export>) -> expdir */
};
+typedef struct exports_file exports_file_t;
void
exp_file_deinit (struct exports_file *expfile);
diff --git a/xlators/nfs/server/src/mount3-auth.c b/xlators/nfs/server/src/mount3-auth.c
index 97c95cbfd23..831d92edbef 100644
--- a/xlators/nfs/server/src/mount3-auth.c
+++ b/xlators/nfs/server/src/mount3-auth.c
@@ -429,6 +429,15 @@ __export_dir_lookup_netgroup (dict_t *dict, char *key, data_t *val,
GF_ASSERT ((*key == '@'));
+ /**
+ * If at any point while searching through the dictionaries we have
+ * already been marked as "Found", exit immediately and do not set
+ * anything else in this struct.
+ */
+ if (ngsa->found) {
+ goto out;
+ }
+
/* We use ++key here because keys start with '@' for ngs */
ngentry = ng_file_get_netgroup (nfile, (key + 1));
if (!ngentry) {
@@ -452,10 +461,6 @@ __export_dir_lookup_netgroup (dict_t *dict, char *key, data_t *val,
ngsa);
}
- /* If the above search was successful, just return */
- if (ngsa->found)
- goto out;
-
/* Run through the netgroups dict */
if (ngentry->netgroup_ngs) {
ngsa->_is_host_dict = _gf_false;
diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
index b7350385c32..1cc0b07a9a6 100644
--- a/xlators/nfs/server/src/mount3.c
+++ b/xlators/nfs/server/src/mount3.c
@@ -24,6 +24,7 @@
#include "iatt.h"
#include "nfs-mem-types.h"
#include "nfs.h"
+#include "nfs3.h"
#include "common-utils.h"
#include "store.h"
#include "glfs-internal.h"
@@ -36,6 +37,7 @@
#include <sys/socket.h>
#include <sys/uio.h>
+#define SUPPORT_RMTAB 0
/* This macro will assist in freeing up entire link list
* of host_auth_spec structure.
@@ -444,7 +446,7 @@ mount_open_rmtab (const char *rmtab, gf_store_handle_t **sh)
return _gf_true;
}
-
+#if SUPPORT_RMTAB
/* Read the rmtab into a clean ms->mountlist.
*/
static void
@@ -472,6 +474,7 @@ mount_read_rmtab (struct mount3_state *ms)
out:
gf_store_handle_destroy (sh);
}
+#endif
/* Write the ms->mountlist to the rmtab.
*
@@ -597,7 +600,9 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
nfs = (struct nfs_state *)ms->nfsx->private;
+#if SUPPORT_RMTAB
update_rmtab = mount_open_rmtab (nfs->rmtab, &sh);
+#endif
strncpy (me->exname, expname, MNTPATHLEN);
/* Sometimes we don't care about the full path
@@ -696,6 +701,9 @@ __mnt3_build_mountid_from_path (const char *path, uuid_t mountid)
uint32_t hashed_path = 0;
int ret = -1;
+ if (!path)
+ goto out;
+
while (strlen (path) > 0 && path[0] == '/')
path++;
@@ -791,7 +799,9 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
}
snprintf (path, PATH_MAX, "/%s", mntxl->name);
+#if SUPPORT_RMTAB
mnt3svc_update_mountlist (ms, req, path, NULL);
+#endif
GF_FREE (path);
if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) {
fh = nfs3_fh_build_indexed_root_fh (ms->nfsx->children, mntxl);
@@ -1163,7 +1173,8 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mres->exp->expname, mres->resolveloc.path);
/* Check if this path is authorized to be mounted */
- authcode = mnt3_authenticate_request (ms, mres->req, NULL, NULL,
+ authcode = mnt3_authenticate_request (ms, mres->req, NULL,
+ mres->exp->vol->name,
mres->exp->fullpath,
&authorized_path,
&authorized_host,
@@ -1185,6 +1196,9 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
/* Build mountid from the authorized path and stick it in the
* filehandle that will get passed back to the client
*/
+ if (!authorized_path) {
+ goto err;
+ }
__mnt3_build_mountid_from_path (authorized_path, fh.mountid);
snprintf (path, PATH_MAX, "/%s%s", mres->exp->vol->name,
@@ -1741,7 +1755,7 @@ mnt3_check_client_net_udp (struct svc_req *req, char *volname, xlator_t *nfsx)
if ((!req) || (!volname) || (!nfsx))
goto err;
- sin = svc_getcaller (req->rq_xprt);
+ sin = (struct sockaddr_in *)svc_getcaller (req->rq_xprt);
if (!sin)
goto err;
@@ -1896,7 +1910,7 @@ _mnt3_get_host_from_peer (const char *peer_addr)
size_t host_len = 0;
char *colon = NULL;
- colon = strchr (peer_addr, ':');
+ colon = strrchr (peer_addr, ':');
if (!colon) {
gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER,
"Bad peer %s", peer_addr);
@@ -1925,9 +1939,23 @@ mnt3_check_cached_fh (struct mount3_state *ms, struct nfs3_fh *fh,
const char *host_addr, gf_boolean_t is_write_op)
{
if (!is_write_op)
- return is_nfs_fh_cached (ms->authcache, fh, host_addr);
+ return auth_cache_allows_fh (ms->authcache, fh, host_addr);
+
+ return auth_cache_allows_write_to_fh (ms->authcache, fh, host_addr);
+}
- return is_nfs_fh_cached_and_writeable (ms->authcache, fh, host_addr);
+/**
+ * mnt3_check_cached_path -- Check if path is cached.
+ *
+ * Calls auxiliary functions based on whether we are checking
+ * a write operation.
+ *
+ */
+int
+mnt3_check_cached_path (struct mount3_state *ms, const char *path,
+ const char *host_addr, gf_boolean_t is_write_op)
+{
+ return auth_cache_allows_path (ms->authcache, path, host_addr);
}
/**
@@ -1961,7 +1989,7 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req,
char *pathdup = NULL;
size_t dlen = 0;
char *auth_host = NULL;
- gf_boolean_t fh_cached = _gf_false;
+ auth_cache_status_t auth_cache_status = AUTH_CACHE_HOST_ENOENT;
struct export_item *expitem = NULL;
GF_VALIDATE_OR_GOTO (GF_MNT, ms, out);
@@ -1982,12 +2010,24 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req,
}
/* Check if the filehandle is cached */
- fh_cached = mnt3_check_cached_fh (ms, fh, host_addr_ip, is_write_op);
- if (fh_cached) {
- gf_msg_trace (GF_MNT, 0, "Found cached FH for %s",
- host_addr_ip);
+ auth_cache_status = fh ? mnt3_check_cached_fh (ms, fh, host_addr_ip,
+ is_write_op) :
+ mnt3_check_cached_path (ms, path, host_addr_ip,
+ is_write_op);
+
+ if (auth_cache_status == AUTH_CACHE_HOST_AUTH_OK) {
+ gf_log (GF_MNT, GF_LOG_TRACE, "Found authorized cached "
+ "FH for [%s]!", host_addr_ip);
auth_status_code = 0;
goto free_and_out;
+ } else if (auth_cache_status == AUTH_CACHE_HOST_EACCES) {
+ gf_log (GF_MNT, GF_LOG_TRACE, "Found de-authorized cached "
+ "FH for [%s]!", host_addr_ip);
+ auth_status_code = -EACCES;
+ goto free_and_out;
+ } else {
+ gf_log (GF_MNT, GF_LOG_TRACE, "Cached FH not found for [%s]!",
+ host_addr_ip);
}
/* Check if the IP is authorized */
@@ -2018,10 +2058,20 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req,
* host if they are null.
*/
if (!authorized_export || !authorized_host) {
- /* Cache the file handle if it was authorized */
- if (fh && auth_status_code == 0)
- cache_nfs_fh (ms->authcache, fh, host_addr_ip, expitem);
+ if (auth_status_code == 0) {
+ auth_cache_status = AUTH_CACHE_HOST_AUTH_OK;
+ } else {
+ auth_cache_status = AUTH_CACHE_HOST_EACCES;
+ }
+ if (fh) {
+ cache_nfs_fh (ms->authcache, fh, host_addr_ip,
+ expitem, auth_cache_status);
+ }
+ if (path) {
+ cache_nfs_path (ms->authcache, path, host_addr_ip,
+ expitem, auth_cache_status);
+ }
goto free_and_out;
}
@@ -2080,15 +2130,18 @@ mnt3_authenticate_request (struct mount3_state *ms, rpcsvc_request_t *req,
const char *path, char **authorized_path,
char **authorized_host, gf_boolean_t is_write_op)
{
- int auth_status_code = -EACCES;
- char *parent_path = NULL;
- const char *parent_old = NULL;
+ int auth_status_code = -EACCES;
+ char *parent_path = NULL;
+ const char *parent_old = NULL;
+ struct mnt3_export *exp = NULL;
+ struct nfs3_state *nfs3 = ms->nfs->nfs3state;
GF_VALIDATE_OR_GOTO (GF_MNT, ms, out);
GF_VALIDATE_OR_GOTO (GF_MNT, req, out);
+ GF_VALIDATE_OR_GOTO (GF_MNT, volname, out);
/* If this option is not set, just allow it through */
- if (!ms->nfs->exports_auth) {
+ if (!nfs3->exports_auth || !nfs3_is_exports_auth(nfs3, volname)) {
/* This function is called in a variety of use-cases (mount
* + each fop) so path/authorized_path are not always present.
* For the cases which it _is_ present we need to populate the
@@ -2213,8 +2266,8 @@ mnt3svc_mnt (rpcsvc_request_t *req)
/* The second authentication check is the exports/netgroups
* check.
*/
- authcode = mnt3_authenticate_request (ms, req, NULL, NULL, path, NULL,
- NULL, _gf_false);
+ authcode = mnt3_authenticate_request (ms, req, NULL, exp->vol->name,
+ path, NULL, NULL, FALSE);
if (authcode != 0) {
mntstat = MNT3ERR_ACCES;
gf_msg_debug (GF_MNT, 0, "Client mount not allowed");
@@ -2265,9 +2318,10 @@ __build_mountlist (struct mount3_state *ms, int *count)
if ((!ms) || (!count))
return NULL;
+#if SUPPORT_RMTAB
/* read rmtab, other peers might have updated it */
mount_read_rmtab(ms);
-
+#endif
*count = 0;
gf_msg_debug (GF_MNT, 0, "Building mount list:");
list_for_each_entry (me, &ms->mountlist, mlist) {
@@ -2399,7 +2453,9 @@ mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
nfs = (struct nfs_state *)ms->nfsx->private;
+#if SUPPORT_RMTAB
update_rmtab = mount_open_rmtab (nfs->rmtab, &sh);
+#endif
if (update_rmtab) {
ret = gf_store_lock (sh);
if (ret)
@@ -2818,7 +2874,8 @@ __mnt3udp_get_export_subdir_inode (struct svc_req *req, char *subdir,
/* AUTH check for subdir i.e. nfs.export-dir */
if (exp->hostspec) {
- struct sockaddr_in *sin = svc_getcaller (req->rq_xprt);
+ struct sockaddr_in *sin;
+ sin = (struct sockaddr_in *)svc_getcaller (req->rq_xprt);
ret = mnt3_verify_auth (sin, exp);
if (ret) {
gf_msg (GF_MNT, GF_LOG_ERROR, EACCES,
@@ -3026,7 +3083,9 @@ mount3udp_add_mountlist (xlator_t *nfsx, char *host, char *export)
LOCK (&ms->mountlock);
{
list_add_tail (&me->mlist, &ms->mountlist);
+#if SUPPORT_RMTAB
mount_rewrite_rmtab(ms, NULL);
+#endif
}
UNLOCK (&ms->mountlock);
return 0;
@@ -3714,6 +3773,9 @@ __mnt3_mounted_exports_walk (dict_t *dict, char *key, data_t *val, void *tmp)
* and umounts them.
*
* @ms: The mountstate for this service that holds all the information we need
+ if (!nfs->nfs3state)
+ return NULL;
+
*
*/
void
@@ -3800,6 +3862,9 @@ _mnt3_auth_param_refresh_thread (void *argv)
/* Sleep before checking the file again */
sleep (mstate->nfs->auth_refresh_time_secs);
+ if (!mstate->nfs->nfs3state->exports_auth)
+ continue;
+
if (_mnt3_has_file_changed (exp_file_path, &exp_time)) {
gf_msg (GF_MNT, GF_LOG_INFO, 0, NFS_MSG_UPDATING_EXP,
"File %s changed, updating exports,",
@@ -3978,7 +4043,7 @@ mnt3svc_init (xlator_t *nfsx)
goto err;
}
- if (nfs->exports_auth) {
+ if (nfs->nfs3state->exports_auth) {
ret = _mnt3_init_auth_params (mstate);
if (ret < 0)
goto err;
@@ -4127,6 +4192,15 @@ mnt1svc_init (xlator_t *nfsx)
}
}
+#ifdef IPV6_DEFAULT
+ ret = dict_set_str (options, "transport.address-family", "inet6");
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "dict_set_str error when trying to enable ipv6");
+ goto err;
+ }
+#endif
+
ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_msg (GF_NFS, GF_LOG_ERROR, errno,
diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c
index e8e226e953e..536a45ede3d 100644
--- a/xlators/nfs/server/src/mount3udp_svc.c
+++ b/xlators/nfs/server/src/mount3udp_svc.c
@@ -133,7 +133,15 @@ mountudp_program_3(struct svc_req *rqstp, register SVCXPRT *transp)
mountres3 *res = NULL;
struct sockaddr_in *sin = NULL;
- sin = svc_getcaller (transp);
+ sin = (struct sockaddr_in *)svc_getcaller (transp);
+ /* svc_getcaller returns a pointer to a sockaddr_in6, even though it
+ * might actually be an IPv4 address. It ought to return a struct sockaddr
+ * and let the caller cast it to the proper address family. Sigh.
+ *
+ * Let's make sure that it's actually an IPv4 address.
+ */
+ GF_ASSERT (sin->sin_family == AF_INET);
+
inet_ntop (AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN+1);
switch (rqstp->rq_proc) {
diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c
index 526918872d7..fca38ba6b87 100644
--- a/xlators/nfs/server/src/nfs-common.c
+++ b/xlators/nfs/server/src/nfs-common.c
@@ -146,8 +146,12 @@ nfs_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)
gf_uuid_copy (loc->gfid, inode->gfid);
}
- if (parent)
+ if (parent) {
loc->parent = inode_ref (parent);
+ if (!gf_uuid_is_null (parent->gfid)) {
+ gf_uuid_copy (loc->pargfid, parent->gfid);
+ }
+ }
if (path) {
loc->path = gf_strdup (path);
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index ddfa89dab11..e94cb03b771 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -33,6 +33,7 @@
#include "syscall.h"
#include "rpcsvc.h"
#include "nfs-messages.h"
+#include "syncop.h"
#define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids"
#define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout"
@@ -204,6 +205,10 @@ nfs_program_register_portmap_all (struct nfs_state *nfs)
if (nfs->override_portnum)
prog->progport = nfs->override_portnum;
(void) rpcsvc_program_register_portmap (prog, prog->progport);
+#ifdef IPV6_DEFAULT
+ (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport,
+ TRUE);
+#endif
}
return (0);
@@ -285,6 +290,55 @@ nfs_deinit_versions (struct list_head *versions, xlator_t *this)
return 0;
}
+void rpcbind_register_prog (rpcsvc_program_t *prog)
+{
+ if (!prog) {
+ return;
+ }
+
+ /*
+ * Attempt to register the program with rpcbind. In the vast majority
+ * of cases this call will fail, since the program should already be
+ * registered. That is fine: registration here is best effort.
+ */
+ rpcsvc_program_register_portmap (prog, prog->progport);
+#ifdef IPV6_DEFAULT
+ rpcsvc_program_register_rpcbind6 (prog, prog->progport, FALSE);
+#endif
+}
+
+/**
+ * rpcbind_autoregister_task
+ *
+ * The purpose of this task is to ensure that NFS stays registered
+ * with rpcbind. The task is best effort, so we ignore the result of
+ * each registration call.
+ */
+int rpcbind_autoregister_task (void *arg)
+{
+ struct nfs_state *nfs = arg;
+ struct nfs_initer_list *version = NULL;
+ struct nfs_initer_list *tmp = NULL;
+ rpcsvc_program_t *prog = NULL;
+ struct list_head *versions = &nfs->versions;
+
+ list_for_each_entry_safe (version, tmp, versions, list) {
+ rpcbind_register_prog (version->program);
+ }
+
+ return 0;
+}
+
+void *nfs_janitor (void *arg)
+{
+ struct nfs_state *nfs = arg;
+ while (_gf_true) {
+ synctask_new (nfs->this->ctx->env, rpcbind_autoregister_task,
+ NULL, NULL, nfs);
+ sleep (10);
+ }
+}
+
int
nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
{
@@ -339,6 +393,18 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
if (version->required)
goto err;
}
+#ifdef IPV6_DEFAULT
+ ret = rpcsvc_program_register_rpcbind6 (prog,
+ prog->progport,
+ TRUE);
+ if (ret == -1) {
+ gf_msg (GF_NFS, GF_LOG_ERROR, 0,
+ NFS_MSG_PGM_REG_FAIL,
+ "Program (ipv6) %s registration failed",
+ prog->progname);
+ goto err;
+ }
+#endif
}
}
@@ -348,6 +414,18 @@ err:
return ret;
}
+int
+nfs_janitor_init (struct nfs_state *nfs)
+{
+ int ret = pthread_create (&nfs->janitor_thread, NULL, nfs_janitor, nfs);
+ if (ret != 0) {
+ gf_log (GF_NFS, GF_LOG_WARNING,
+ "Unable to start rpcbind register thread! Error=%s",
+ strerror (ret));
+ return -1;
+ }
+ return 0;
+}
int
nfs_add_all_initiators (struct nfs_state *nfs)
@@ -355,24 +433,24 @@ nfs_add_all_initiators (struct nfs_state *nfs)
int ret = 0;
/* Add the initializers for all versions. */
- ret = nfs_add_initer (&nfs->versions, mnt3svc_init, _gf_true);
+ ret = nfs_add_initer (&nfs->versions, mnt1svc_init, _gf_true);
if (ret == -1) {
gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
- "Failed to add MOUNT3 protocol initializer");
+ "Failed to add MOUNT1 protocol initializer");
goto ret;
}
- ret = nfs_add_initer (&nfs->versions, mnt1svc_init, _gf_true);
+ ret = nfs_add_initer (&nfs->versions, nfs3svc_init, _gf_true);
if (ret == -1) {
gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
- "Failed to add MOUNT1 protocol initializer");
+ "Failed to add NFS3 protocol initializer");
goto ret;
}
- ret = nfs_add_initer (&nfs->versions, nfs3svc_init, _gf_true);
+ ret = nfs_add_initer (&nfs->versions, mnt3svc_init, _gf_true);
if (ret == -1) {
gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
- "Failed to add NFS3 protocol initializer");
+ "Failed to add MOUNT3 protocol initializer");
goto ret;
}
@@ -759,6 +837,8 @@ nfs_init_state (xlator_t *this)
return NULL;
}
+ nfs->this = this;
+
nfs->memfactor = GF_NFS_DEFAULT_MEMFACTOR;
if (dict_get (this->options, "nfs.mem-factor")) {
ret = dict_get_str (this->options, "nfs.mem-factor",
@@ -901,6 +981,16 @@ nfs_init_state (xlator_t *this)
}
}
+#ifdef IPV6_DEFAULT
+ ret = dict_set_str (this->options, "transport.address-family",
+ "inet6");
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error");
+ goto free_foppool;
+ }
+#endif
+
+
/* Right only socket support exists between nfs client and
* gluster nfs, so we can set default value as socket
*/
@@ -933,24 +1023,22 @@ nfs_init_state (xlator_t *this)
}
nfs->exports_auth = GF_NFS_DEFAULT_EXPORT_AUTH;
- if (dict_get(this->options, "nfs.exports-auth-enable")) {
+ if (dict_get (this->options, "nfs.exports-auth-enable")) {
ret = dict_get_str (this->options, "nfs.exports-auth-enable",
&optstr);
if (ret == -1) {
- gf_msg (GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
- "Failed to parse dict");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict");
goto free_foppool;
}
ret = gf_string2boolean (optstr, &boolt);
if (ret < 0) {
- gf_msg (GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
- "Failed to parse bool string");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse bool "
+ "string");
goto free_foppool;
}
- if (boolt == _gf_true)
- nfs->exports_auth = 1;
+ nfs->exports_auth = boolt;
}
nfs->auth_refresh_time_secs = GF_NFS_DEFAULT_AUTH_REFRESH_INTERVAL_SEC;
@@ -1190,6 +1278,7 @@ nfs_reconfigure_state (xlator_t *this, dict_t *options)
"nfs.transport-type",
"nfs.mem-factor",
NULL};
+ char *exports_auth_enable = NULL;
GF_VALIDATE_OR_GOTO (GF_NFS, this, out);
GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out);
@@ -1269,6 +1358,21 @@ nfs_reconfigure_state (xlator_t *this, dict_t *options)
"Reconfigured nfs.mount-rmtab path: %s", nfs->rmtab);
}
+ /* reconfig nfs.exports-auth-enable */
+ if (dict_get (options, "nfs.exports-auth-enable")) {
+ ret = dict_get_str (options, "nfs.exports-auth-enable",
+ &exports_auth_enable);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read "
+ "reconfigured option: nfs.exports-auth-enable");
+ goto out;
+ }
+ ret = gf_string2int (exports_auth_enable, &nfs->exports_auth);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
GF_OPTION_RECONF (OPT_SERVER_AUX_GIDS, optbool,
options, bool, out);
if (nfs->server_aux_gids != optbool) {
@@ -1520,6 +1624,13 @@ init (xlator_t *this) {
return (-1);
}
+ ret = nfs_janitor_init (nfs);
+ if (ret) {
+ gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_FAIL,
+ "Failed to initialize janitor");
+ return (-1);
+ }
+
gf_msg (GF_NFS, GF_LOG_INFO, 0, NFS_MSG_STARTED,
"NFS service started");
return (0); /* SUCCESS */
@@ -2019,7 +2130,7 @@ struct volume_options options[] = {
},
{ .key = {"nfs.mount-rmtab"},
.type = GF_OPTION_TYPE_PATH,
- .default_value = NFS_DATADIR "/rmtab",
+ .default_value = "/-",
.description = "Set the location of the cache file that is used to "
"list all the NFS-clients that have connected "
"through the MOUNT protocol. If this is on shared "
@@ -2075,7 +2186,7 @@ struct volume_options options[] = {
.description = "Sets the number of non-idempotent "
"requests to cache in drc"
},
- { .key = {"nfs.exports-auth-enable"},
+ { .key = {"nfs.*.exports-auth-enable"},
.type = GF_OPTION_TYPE_BOOL,
.description = "Set the option to 'on' to enable exports/netgroup "
"authentication in the NFS server and mount daemon."
diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h
index 9bcc88f5548..4f5faf29f6b 100644
--- a/xlators/nfs/server/src/nfs.h
+++ b/xlators/nfs/server/src/nfs.h
@@ -96,6 +96,8 @@ struct nfs_state {
uint32_t server_aux_gids_max_age;
gid_cache_t gid_cache;
uint32_t generation;
+ pthread_t janitor_thread;
+ xlator_t *this;
gf_boolean_t register_portmap;
char *rpc_statd;
char *rpc_statd_pid_file;
diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c
index 0b977092fbb..64bd08a3fc7 100644
--- a/xlators/nfs/server/src/nfs3-helpers.c
+++ b/xlators/nfs/server/src/nfs3-helpers.c
@@ -239,7 +239,12 @@ nfs3_errno_to_nfsstat3 (int errnum)
break;
case ENOTCONN:
- stat = NFS3ERR_IO;
+ /* If connections to bricks cannot be established,
+ * the filesystem is effectively in read-only mode
+ * to protect data. E.g., when all bricks in a subvolume
+ * crash.
+ */
+ stat = NFS3ERR_ROFS;
break;
case EDQUOT:
@@ -3975,11 +3980,18 @@ nfs3_fh_auth_nfsop (nfs3_call_state_t *cs, gf_boolean_t is_write_op)
{
struct nfs_state *nfs = NULL;
struct mount3_state *ms = NULL;
+ int auth_status = -1;
nfs = (struct nfs_state *)cs->nfsx->private;
ms = (struct mount3_state *)nfs->mstate;
- return mnt3_authenticate_request (ms, cs->req, &cs->resolvefh, NULL,
- NULL, NULL, NULL, is_write_op);
+ auth_status = mnt3_authenticate_request (ms, cs->req, &cs->resolvefh,
+ cs->vol->name, NULL, NULL,
+ NULL, is_write_op);
+
+ if (auth_status != 0) {
+ cs->resolve_errno = auth_status;
+ }
+ return auth_status;
}
int
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
index 8b1d62b46ac..2426028ed2d 100644
--- a/xlators/nfs/server/src/nfs3.c
+++ b/xlators/nfs/server/src/nfs3.c
@@ -211,6 +211,25 @@ out:
return ret;
}
+int
+nfs3_is_exports_auth (struct nfs3_state *nfs3, const char *volname)
+{
+ int ret = 0;
+ struct nfs3_export *exp = NULL;
+
+ GF_VALIDATE_OR_GOTO (GF_NFS3, nfs3, out);
+
+ list_for_each_entry (exp, &nfs3->exports, explist) {
+ if (strcmp (exp->subvol->name, volname) == 0) {
+ ret = exp->exports_auth;
+ break;
+ }
+ }
+
+out:
+ return ret;
+}
+
#define nfs3_map_fh_to_volume(nfs3state, handle, req, volume, status, label) \
do { \
@@ -413,6 +432,28 @@ out:
}
+/*
+ * This macro checks whether the volume is started.
+ * If it is not started, it closes the client connection and logs it.
+ *
+ * Why do we do this?
+ *
+ * There is a "race condition" where gNFSd may start listening for RPC requests
+ * prior to the volume being started. Presumably, that is why this macro exists
+ * in the first place. The NFS kernel client (specifically Linux's NFS
+ * kernel client) establishes a TCP connection to our endpoint and
+ * (re-)sends requests. If we ignore a request and return nothing back,
+ * the NFS kernel client waits forever for our response. If the TCP
+ * connection were to die and be re-established, the requests would be
+ * retransmitted and everything would begin working as expected.
+ *
+ * This is arguably bad behavior on the client side, but to make every
+ * user's life easier, gNFSd should simply disconnect the TCP connection
+ * if it sees requests before it is ready to accept them.
+ *
+ */
+
#define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl) \
do { \
if ((!nfs_subvolume_started (nfs_state (nf3stt->nfsx), vlm))){\
@@ -420,11 +461,32 @@ out:
NFS_MSG_VOL_DISABLE, \
"Volume is disabled: %s", \
vlm->name); \
+ nfs3_disconnect_transport (req->trans); \
rtval = RPCSVC_ACTOR_IGNORE; \
goto erlbl; \
} \
} while (0) \
+void
+nfs3_disconnect_transport (rpc_transport_t *transport)
+{
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO (GF_NFS3, transport, out);
+
+ ret = rpc_transport_disconnect (transport);
+ if (ret != 0) {
+ gf_log (GF_NFS3, GF_LOG_WARNING,
+ "Unable to close client connection to %s.",
+ transport->peerinfo.identifier);
+ } else {
+ gf_log (GF_NFS3, GF_LOG_WARNING,
+ "Closed client connection to %s.",
+ transport->peerinfo.identifier);
+ }
+out:
+ return;
+}
int
nfs3_export_sync_trusted (struct nfs3_state *nfs3, uuid_t exportid)
@@ -819,6 +881,12 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cs = frame->local;
if (op_ret == -1) {
+ /* Prevent crashes for the case where this call fails
+ * and buf is left in a NULL state, yet the op_errno == 0.
+ */
+ if (!buf && op_errno == 0) {
+ op_errno = EIO;
+ }
status = nfs3_cbk_errno_status (op_ret, op_errno);
}
@@ -5621,6 +5689,35 @@ no_dvm:
(exp->trusted_sync == 0)?"no trusted_sync":"trusted_sync",
(exp->trusted_write == 0)?"no trusted_write":"trusted_write");
ret = 0;
+
+ ret = snprintf (searchkey, 1024, "nfs.%s.exports-auth-enable", name);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "snprintf failed");
+ ret = -1;
+ goto err;
+ }
+
+ if (dict_get (options, searchkey)) {
+ ret = dict_get_str (options, searchkey, &optstr);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict");
+ goto err;
+ }
+
+ ret = gf_string2boolean (optstr, &boolt);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse bool "
+ "string");
+ goto err;
+ }
+
+ exp->exports_auth = boolt ? TRUE : FALSE;
+ if (boolt) {
+ struct nfs_state *priv = nfsx->private;
+ priv->nfs3state->exports_auth = boolt;
+ }
+ }
+
err:
return ret;
}
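
A quick sketch of how the per-volume option key above is constructed. The volume name "myvol" is hypothetical; the format string is the one used in this change:

        char searchkey[1024];

        /* Produces "nfs.myvol.exports-auth-enable", the per-volume key
         * parsed by the hunk above. */
        snprintf (searchkey, sizeof (searchkey),
                  "nfs.%s.exports-auth-enable", "myvol");
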
@@ -5727,6 +5824,7 @@ nfs3_init_state (xlator_t *nfsx)
goto ret;
}
+ nfs->nfs3state = nfs3;
nfs3->nfsx = nfsx;
nfs3->exportslist = nfsx->children;
INIT_LIST_HEAD (&nfs3->exports);
@@ -5749,7 +5847,6 @@ nfs3_init_state (xlator_t *nfsx)
goto free_localpool;
}
- nfs->nfs3state = nfs3;
ret = 0;
free_localpool:
diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h
index 4cb3e67528d..36d981c3eef 100644
--- a/xlators/nfs/server/src/nfs3.h
+++ b/xlators/nfs/server/src/nfs3.h
@@ -31,6 +31,7 @@
#define GF_NFS3_IOBPOOL_MULT GF_NFS_CONCURRENT_OPS_MULT
#define GF_NFS3_CLTABLE_BUCKETS_MULT 2
#define GF_NFS3_FDTABLE_BUCKETS_MULT 2
+#define GF_NFS3_DEFAULT_EXPORT_AUTH _gf_false
/* Static values used for FSINFO
@@ -45,7 +46,7 @@
#define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */
#define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */
-#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_FILE_IO_SIZE_DEF (512 * GF_UNIT_KB)
#define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX
#define GF_NFS3_RTMIN GF_NFS3_FILE_IO_SIZE_MIN
@@ -99,6 +100,7 @@ struct nfs3_export {
int trusted_sync;
int trusted_write;
int rootlookedup;
+ int exports_auth;
};
#define GF_NFS3_DEFAULT_VOLACCESS (GF_NFS3_VOLACCESS_RW)
@@ -142,6 +144,9 @@ typedef struct nfs3_state {
gf_lock_t fdlrulock;
int fdcount;
uint32_t occ_logger;
+
+ /* Enable exports auth model */
+ gf_boolean_t exports_auth;
} nfs3_state_t;
typedef enum nfs3_lookup_type {
@@ -280,4 +285,7 @@ nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options);
extern uint64_t
nfs3_request_xlator_deviceid (rpcsvc_request_t *req);
+extern int
+nfs3_is_exports_auth (struct nfs3_state *nfs3, const char *volname);
+
#endif
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index 98c37746921..f199b229bc2 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -1479,6 +1479,74 @@ ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
return 0;
}
+int32_t
+ioc_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct statvfs *buf, dict_t *xdata)
+{
+ ioc_table_t *table = NULL;
+ struct ioc_statvfs *cache = NULL;
+
+ if (op_ret != 0)
+ goto out;
+
+ table = this->private;
+ cache = &table->statfs_cache;
+
+ LOCK (&cache->lock);
+
+ gettimeofday (&cache->tv, NULL);
+ cache->buf = *buf;
+
+ UNLOCK (&cache->lock);
+
+out:
+ STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+ioc_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ioc_table_t *table = NULL;
+ struct ioc_statvfs *cache = NULL;
+ struct statvfs buf;
+ struct timeval tv = {0,};
+
+ table = this->private;
+ cache = &table->statfs_cache;
+
+ if (!cache->enabled)
+ goto disabled;
+
+ gettimeofday (&tv, NULL);
+
+ LOCK (&cache->lock);
+
+ if (time_elapsed (&tv, &cache->tv) >= cache->timeout) {
+ UNLOCK (&cache->lock);
+ goto uncached;
+ }
+
+ buf = cache->buf;
+
+ UNLOCK (&cache->lock);
+
+ STACK_UNWIND_STRICT (statfs, frame, 0, 0, &buf, xdata);
+
+ return 0;
+
+disabled:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (frame->this),
+ FIRST_CHILD (frame->this)->fops->statfs, loc, xdata);
+ return 0;
+
+uncached:
+ STACK_WIND (frame, ioc_statfs_cbk,
+ FIRST_CHILD (frame->this),
+ FIRST_CHILD (frame->this)->fops->statfs, loc, xdata);
+ return 0;
+}
int32_t
ioc_get_priority_list (const char *opt_str, struct list_head *first)
@@ -1696,6 +1764,13 @@ reconfigure (xlator_t *this, dict_t *options)
}
table->cache_size = cache_size_new;
+ GF_OPTION_RECONF ("statfs-cache", table->statfs_cache.enabled,
+ options, bool, unlock);
+
+ GF_OPTION_RECONF ("statfs-cache-timeout",
+ table->statfs_cache.timeout,
+ options, int32, unlock);
+
ret = 0;
}
unlock:
@@ -1755,6 +1830,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("max-file-size", table->max_file_size, size_uint64, out);
+ GF_OPTION_INIT ("statfs-cache", table->statfs_cache.enabled, bool, out);
+
+ GF_OPTION_INIT ("statfs-cache-timeout", table->statfs_cache.timeout, int32, out);
+
if (!check_cache_size_ok (this, table->cache_size)) {
ret = -1;
goto out;
@@ -1827,6 +1906,11 @@ init (xlator_t *this)
ctx = this->ctx;
ioc_log2_page_size = log_base2 (ctx->page_size);
+ LOCK_INIT (&table->statfs_cache.lock);
+ /* Invalidate statfs cache */
+ table->statfs_cache.tv.tv_sec = 0;
+ table->statfs_cache.tv.tv_usec = 0;
+
out:
if (ret == -1) {
if (table != NULL) {
@@ -2096,6 +2180,7 @@ fini (xlator_t *this)
GF_ASSERT (list_empty (&table->inode_lru[i]));
}
+ LOCK_DESTROY (&table->statfs_cache.lock);
GF_ASSERT (list_empty (&table->inodes));
*/
pthread_mutex_destroy (&table->table_lock);
@@ -2120,6 +2205,7 @@ struct xlator_fops fops = {
.readdirp = ioc_readdirp,
.discard = ioc_discard,
.zerofill = ioc_zerofill,
+ .statfs = ioc_statfs,
};
@@ -2171,5 +2257,21 @@ struct volume_options options[] = {
.description = "Maximum file size which would be cached by the "
"io-cache translator."
},
+ { .key = {"statfs-cache"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "0",
+ .description = "The cached statfs for a filesystem will be "
+ "till 'statfs-cache-timeout' seconds, after which re-validation "
+ "is performed."
+ },
+ { .key = {"statfs-cache-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 60,
+ .default_value = "1",
+ .description = "The cached statfs for a filesystem will be "
+ "till 'statfs-cache-timeout' seconds, after which re-validation "
+ "is performed."
+ },
{ .key = {NULL} },
};
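
To restate the caching rule that ioc_statfs() implements, a small sketch using the same ioc_statvfs fields and the time_elapsed() helper referenced above; this is only an illustration, not part of the change:

static int
statfs_cache_is_fresh (struct ioc_statvfs *cache, struct timeval *now)
{
        /* Serve from cache only while the cached result is younger than
         * 'statfs-cache-timeout' seconds; otherwise re-validate. */
        return cache->enabled &&
               time_elapsed (now, &cache->tv) < cache->timeout;
}
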
diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h
index d7c823fe962..da71b2f2371 100644
--- a/xlators/performance/io-cache/src/io-cache.h
+++ b/xlators/performance/io-cache/src/io-cache.h
@@ -148,23 +148,32 @@ struct ioc_inode {
inode_t *inode;
};
+struct ioc_statvfs {
+ struct statvfs buf;
+ int32_t timeout;
+ struct timeval tv;
+ gf_boolean_t enabled;
+ gf_lock_t lock;
+};
+
struct ioc_table {
- uint64_t page_size;
- uint64_t cache_size;
- uint64_t cache_used;
- uint64_t min_file_size;
- uint64_t max_file_size;
- struct list_head inodes; /* list of inodes cached */
- struct list_head active;
- struct list_head *inode_lru;
- struct list_head priority_list;
- int32_t readv_count;
- pthread_mutex_t table_lock;
- xlator_t *xl;
- uint32_t inode_count;
- int32_t cache_timeout;
- int32_t max_pri;
- struct mem_pool *mem_pool;
+ uint64_t page_size;
+ uint64_t cache_size;
+ uint64_t cache_used;
+ uint64_t min_file_size;
+ uint64_t max_file_size;
+ struct list_head inodes; /* list of inodes cached */
+ struct list_head active;
+ struct list_head *inode_lru;
+ struct list_head priority_list;
+ int32_t readv_count;
+ pthread_mutex_t table_lock;
+ xlator_t *xl;
+ uint32_t inode_count;
+ int32_t cache_timeout;
+ int32_t max_pri;
+ struct mem_pool *mem_pool;
+ struct ioc_statvfs statfs_cache;
};
typedef struct ioc_table ioc_table_t;
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 72a82082563..7f9dc5f82a8 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -161,8 +161,6 @@ iot_worker (void *data)
THIS = this;
for (;;) {
- sleep_till.tv_sec = time (NULL) + conf->idle_time;
-
pthread_mutex_lock (&conf->mutex);
{
if (pri != -1) {
@@ -175,8 +173,11 @@ iot_worker (void *data)
break;
}
- conf->sleep_count++;
+ clock_gettime (CLOCK_REALTIME_COARSE,
+ &sleep_till);
+ sleep_till.tv_sec += conf->idle_time;
+ conf->sleep_count++;
ret = pthread_cond_timedwait (&conf->cond,
&conf->mutex,
&sleep_till);
@@ -232,14 +233,25 @@ int
do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri)
{
int ret = 0;
+ int active_count = 0;
pthread_mutex_lock (&conf->mutex);
{
__iot_enqueue (conf, stub, pri);
- pthread_cond_signal (&conf->cond);
-
- ret = __iot_workers_scale (conf);
+ /* If we already have an ample supply of threads alive,
+ * it is far more efficient to keep the existing ones busy
+ * than to create new ones and signal everyone.
+ */
+ active_count = conf->curr_count - conf->sleep_count;
+ if (conf->fops_per_thread_ratio == 0 || active_count == 0 ||
+ (conf->queue_size/active_count >
+ conf->fops_per_thread_ratio &&
+ active_count < conf->max_count)) {
+ pthread_cond_signal (&conf->cond);
+
+ ret = __iot_workers_scale (conf);
+ }
}
pthread_mutex_unlock (&conf->mutex);
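
To make the heuristic above concrete: with the default ratio of 20, a queue of 100 FOPs and 4 active threads (100/4 = 25 > 20) wakes or scales a worker, while 60 queued FOPs and 4 active threads (60/4 = 15) does not. A sketch of the same predicate, using the iot_conf_t fields touched by this change (illustration only):

static int
iot_should_wake_worker (iot_conf_t *conf)
{
        int active = conf->curr_count - conf->sleep_count;

        /* Feature disabled, or no thread currently awake: always wake. */
        if (conf->fops_per_thread_ratio == 0 || active == 0)
                return 1;

        /* Wake/scale only when the backlog per active thread exceeds the
         * configured ratio and we are still under the thread cap. */
        return (conf->queue_size / active > conf->fops_per_thread_ratio &&
                active < conf->max_count);
}
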
@@ -266,6 +278,9 @@ iot_get_pri_meaning (iot_pri_t pri)
case IOT_PRI_MAX:
name = "invalid";
break;
+ case IOT_PRI_UNSPEC:
+ name = "unspecified";
+ break;
}
return name;
}
@@ -598,6 +613,34 @@ int
iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
+ iot_conf_t *conf = NULL;
+ dict_t *depths = NULL;
+ int i = 0;
+
+ conf = this->private;
+
+ if (conf && name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) {
+ // We explicitly do not want a reference count
+ // for this dict in this translator
+ depths = get_new_dict ();
+ if (!depths)
+ goto unwind_special_getxattr;
+
+ for (i = 0; i < IOT_PRI_MAX; i++) {
+ if (dict_set_int32 (depths,
+ (char *)fop_pri_to_string (i),
+ conf->queue_sizes[i]) != 0) {
+ dict_destroy (depths);
+ depths = NULL;
+ goto unwind_special_getxattr;
+ }
+ }
+
+unwind_special_getxattr:
+ STACK_UNWIND_STRICT (getxattr, frame, 0, 0, depths, xdata);
+ return 0;
+ }
+
IOT_FOP (getxattr, frame, this, loc, name, xdata);
return 0;
}
@@ -904,6 +947,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out);
+ GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+ options, int32, out);
+
GF_OPTION_RECONF ("high-prio-threads",
conf->ac_iot_limit[IOT_PRI_HI], options, int32, out);
@@ -978,6 +1024,9 @@ init (xlator_t *this)
GF_OPTION_INIT ("thread-count", conf->max_count, int32, out);
+ GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio,
+ int32, out);
+
GF_OPTION_INIT ("high-prio-threads",
conf->ac_iot_limit[IOT_PRI_HI], int32, out);
@@ -1140,6 +1189,20 @@ struct volume_options options[] = {
"perform concurrent IO operations"
},
+ { .key = {"fops-per-thread-ratio"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = IOT_MIN_FOP_PER_THREAD,
+ .max = IOT_MAX_FOP_PER_THREAD,
+ .default_value = "20",
+ .description = "The optimal ratio of threads to FOPs in the queue "
+ "we wish to achieve before creating a new thread. "
+ "The idea here is it's far cheaper to keep our "
+ "currently running threads busy than spin up "
+ "new threads or cause a stampeding herd of threads "
+ "to service a singlular FOP when you have a thread "
+ "which will momentarily become available to do the "
+ "work."
+ },
{ .key = {"high-prio-threads"},
.type = GF_OPTION_TYPE_INT,
.min = IOT_MIN_THREADS,
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index fa955b5954b..011d4a00f7f 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -34,20 +34,14 @@ struct iot_conf;
#define IOT_MIN_THREADS 1
#define IOT_DEFAULT_THREADS 16
-#define IOT_MAX_THREADS 64
+#define IOT_MAX_THREADS 256
+#define IOT_MIN_FOP_PER_THREAD 0
+#define IOT_MAX_FOP_PER_THREAD 2000
#define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024))
-typedef enum {
- IOT_PRI_HI = 0, /* low latency */
- IOT_PRI_NORMAL, /* normal */
- IOT_PRI_LO, /* bulk */
- IOT_PRI_LEAST, /* least */
- IOT_PRI_MAX,
-} iot_pri_t;
-
#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */
struct iot_least_throttle {
struct timeval sample_time; /* timestamp of current sample */
@@ -62,6 +56,7 @@ struct iot_conf {
pthread_cond_t cond;
int32_t max_count; /* configured maximum */
+ int32_t fops_per_thread_ratio;
int32_t curr_count; /* actual number of threads running */
int32_t sleep_count;
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index 30443761c56..c3baafdc1b6 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -33,6 +33,7 @@ struct mdc_conf {
gf_boolean_t cache_selinux;
gf_boolean_t force_readdirp;
gf_boolean_t cache_swift_metadata;
+ gf_boolean_t cache_all_xattrs;
};
@@ -792,6 +793,7 @@ struct checkpair {
static int
is_mdc_key_satisfied (const char *key)
{
+ unsigned int checked_keys = 0;
const char *mdc_key = NULL;
int i = 0;
@@ -801,11 +803,13 @@ is_mdc_key_satisfied (const char *key)
for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {
if (!mdc_keys[i].load)
continue;
+
+ checked_keys++;
if (strcmp (mdc_key, key) == 0)
return 1;
}
- return 0;
+ return 0;
}
@@ -875,7 +879,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xattr_rsp = NULL;
dict_t *xattr_alloc = NULL;
mdc_local_t *local = NULL;
-
+ struct mdc_conf *conf = this->private;
local = mdc_local_get (frame);
if (!local)
@@ -899,10 +903,17 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
if (ret != 0)
goto uncached;
- if (!mdc_xattr_satisfied (this, xdata, xattr_rsp))
+ /* Only check the keys if we are not caching all the xattrs */
+ if (!conf->cache_all_xattrs &&
+ !mdc_xattr_satisfied (this, xdata, xattr_rsp)) {
goto uncached;
+ }
}
+ gf_msg (this->name, GF_LOG_TRACE, 0, 0,
+ "Returning lookup from cache for gfid %s",
+ uuid_utoa(loc->inode->gfid));
+
MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf,
xattr_rsp, &postparent);
@@ -1882,6 +1893,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
int op_errno = ENODATA;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
+ struct mdc_conf *conf = this->private;
local = mdc_local_get (frame);
if (!local)
@@ -1897,7 +1909,18 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
goto uncached;
if (!xattr || !dict_get (xattr, (char *)key)) {
- ret = -1;
+ /* If we can't find the extended attribute and cache-all-xattrs
+ * is enabled, we should wind and try to find it.
+ *
+ * NOTE: Quota & AFR queries through the mount
+ * (i.e., virtual Gluster xattrs)
+ * won't work unless we do this.
+ */
+ if (conf->cache_all_xattrs) {
+ goto uncached;
+ }
+
+ ret = -1;
op_errno = ENODATA;
}
@@ -2363,7 +2386,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out);
-
+ GF_OPTION_RECONF("cache-all-xattrs", conf->cache_all_xattrs, options,
+ bool, out);
out:
return 0;
}
@@ -2404,6 +2428,7 @@ init (xlator_t *this)
conf->cache_swift_metadata);
GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out);
+ GF_OPTION_INIT ("cache-all-xattrs", conf->cache_all_xattrs, bool, out);
out:
this->private = conf;
@@ -2474,7 +2499,7 @@ struct volume_options options[] = {
{ .key = {"md-cache-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .max = 60,
+ .max = 300,
.default_value = "1",
.description = "Time period after which cache has to be refreshed",
},
@@ -2484,5 +2509,19 @@ struct volume_options options[] = {
.description = "Convert all readdir requests to readdirplus to "
"collect stat info on each entry.",
},
+ { .key = {"strict-xattrs"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .description = "When reading extended attributes from the cache, "
+ "if an xattr is not found, attempt to find it by winding "
+ "instead of returning ENODATA. This is necessary to query "
+ "the special extended attributes (trusted.glusterfs.quota.size) "
+ "through a FUSE mount with md-cache enabled."
+ },
+ { .key = {"cache-all-xattrs"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Cache all the extended attributes for an inode.",
+ },
{ .key = {NULL} },
};
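
The NOTE about virtual Gluster xattrs refers to client-side queries such as the quota size attribute mentioned in the option text above. A minimal client-side sketch of such a query; the mount path is hypothetical:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int
main (void)
{
        char buf[256];

        /* "/mnt/gluster/dir" is a hypothetical path on a FUSE mount.
         * The quota size xattr is generated on the fly, so md-cache must
         * wind this request instead of answering ENODATA from its cache. */
        ssize_t len = getxattr ("/mnt/gluster/dir",
                                "trusted.glusterfs.quota.size",
                                buf, sizeof (buf));
        if (len < 0)
                perror ("getxattr");

        return 0;
}
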
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 7f5719b1e48..bc59036ff88 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -169,6 +169,7 @@ typedef struct wb_request {
typedef struct wb_conf {
uint64_t aggregate_size;
+ uint64_t page_size;
uint64_t window_size;
gf_boolean_t flush_behind;
gf_boolean_t trickling_writes;
@@ -1207,18 +1208,21 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
char *ptr = NULL;
struct iobuf *iobuf = NULL;
struct iobref *iobref = NULL;
+ struct wb_conf *conf = NULL;
int ret = -1;
ssize_t required_size = 0;
size_t holder_len = 0;
size_t req_len = 0;
+ conf = req->wb_inode->this->private;
+
if (!holder->iobref) {
holder_len = iov_length (holder->stub->args.vector,
holder->stub->args.count);
req_len = iov_length (req->stub->args.vector,
req->stub->args.count);
- required_size = max ((THIS->ctx->page_size),
+ required_size = max ((conf->page_size),
(holder_len + req_len));
iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool,
required_size);
@@ -1281,7 +1285,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
wb_request_t *holder = NULL;
wb_conf_t *conf = NULL;
int ret = 0;
- ssize_t page_size = 0;
/* With asynchronous IO from a VM guest (as a file), there
can be two sequential writes happening in two regions
@@ -1292,7 +1295,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
through the interleaved ops
*/
- page_size = wb_inode->this->ctx->page_size;
conf = wb_inode->this->private;
list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) {
@@ -1343,7 +1345,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
- space_left = page_size - holder->write_size;
+ space_left = wb_inode->window_conf - holder->write_size;
if (space_left < req->write_size) {
holder->ordering.go = 1;
@@ -2471,6 +2473,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64,
out);
+ GF_OPTION_RECONF ("cache-size", conf->page_size, options, size_uint64,
+ out);
+
GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,
out);
@@ -2522,6 +2527,7 @@ init (xlator_t *this)
/* configure 'option window-size <size>' */
GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out);
+ GF_OPTION_INIT ("cache-size", conf->page_size, size_uint64, out);
if (!conf->window_size && conf->aggregate_size) {
gf_msg (this->name, GF_LOG_WARNING, 0,
diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
index 988c1dce758..d0c63c18b46 100644
--- a/xlators/protocol/client/src/client-handshake.c
+++ b/xlators/protocol/client/src/client-handshake.c
@@ -15,6 +15,7 @@
#include "glusterfs.h"
#include "statedump.h"
#include "compat-errno.h"
+#include "latency.h"
#include "glusterfs3.h"
#include "portmap-xdr.h"
@@ -1542,7 +1543,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi
rpc_clnt_reconfig (conf->rpc, &config);
conf->skip_notify = 1;
- conf->quick_reconnect = 1;
+ conf->quick_reconnect = 1;
out:
if (frame)
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 66f15b8a67c..aa9cf9b31e4 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -467,7 +467,7 @@ int32_t
client_forget (xlator_t *this, inode_t *inode)
{
/* Nothing here */
- return 0;
+ return 0;
}
int32_t
@@ -545,7 +545,7 @@ out:
STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -571,7 +571,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -600,7 +600,7 @@ out:
STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -628,7 +628,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -657,7 +657,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -687,7 +687,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -718,7 +718,7 @@ out:
STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -748,7 +748,7 @@ out:
STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -778,7 +778,7 @@ out:
STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN,
NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -807,7 +807,7 @@ out:
STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN,
NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -837,7 +837,7 @@ out:
STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -867,7 +867,7 @@ out:
STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -897,7 +897,7 @@ out:
STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -932,7 +932,7 @@ out:
STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN,
NULL, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -965,7 +965,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1000,7 +1000,7 @@ out:
STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN,
NULL, 0, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1038,7 +1038,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1064,7 +1064,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1093,7 +1093,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1120,7 +1120,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1149,7 +1149,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1177,7 +1177,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1204,7 +1204,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
static gf_boolean_t
@@ -1393,7 +1393,7 @@ out:
if (need_unwind)
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL);
- return 0;
+ return 0;
}
@@ -1423,7 +1423,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1453,7 +1453,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1482,7 +1482,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1512,7 +1512,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1542,7 +1542,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1571,7 +1571,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1598,7 +1598,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1654,7 +1654,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1684,7 +1684,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1715,7 +1715,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (finodelk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1747,7 +1747,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1780,7 +1780,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL);
- return 0;
+ return 0;
}
@@ -1809,7 +1809,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1840,7 +1840,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1872,7 +1872,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL);
- return 0;
+ return 0;
}
@@ -1901,7 +1901,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -1929,7 +1929,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int32_t
@@ -2155,7 +2155,7 @@ out:
if (ret)
STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL);
- return 0;
+ return 0;
}
@@ -2227,6 +2227,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf = this->private;
switch (event) {
+ case RPC_CLNT_PING:
+ {
+ ret = default_notify (this, GF_EVENT_CHILD_PING, NULL);
+ if (ret)
+ gf_log (this->name, GF_LOG_INFO,
+ "CHILD_PING notify failed");
+ conf->last_sent_event = GF_EVENT_CHILD_PING;
+ break;
+ }
case RPC_CLNT_CONNECT:
{
conf->connected = 1;
@@ -2312,13 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf->connected = 0;
conf->skip_notify = 0;
- if (conf->quick_reconnect) {
- conf->quick_reconnect = 0;
- rpc_clnt_cleanup_and_start (rpc);
-
- } else {
+ if (conf->rpc->conn.connected) {
+ /* Having conf->connected false and
+ * conf->rpc->conn.connected true is an
+ * unrecoverable state, since rpc_clnt_reconnect
+ * will do nothing for an already connected connection.
+ * A good fix would be to ensure serialized
+ * delivery of transport messages, but that is super hard
+ * and this is rare. So... ghetto "fix", disconnect the
+ * RPC and start the race again. Maybe we'll win
+ * next time!
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "Client %s reconnect race detected, "
+ "restarting.", conf->rpc->conn.name);
+ conf->quick_reconnect = 1;
+ rpc_transport_disconnect (rpc->conn.trans);
rpc->conn.config.remote_port = 0;
-
+ } else {
+ if (conf->quick_reconnect) {
+ conf->quick_reconnect = 0;
+ rpc_clnt_cleanup_and_start (rpc);
+ } else {
+ rpc->conn.config.remote_port = 0;
+ }
}
break;
@@ -2670,7 +2696,7 @@ reconfigure (xlator_t *this, dict_t *options)
ret = 0;
out:
- return ret;
+ return ret;
}
@@ -2724,6 +2750,8 @@ init (xlator_t *this)
this->private = conf;
+ this->client_latency.min = UINT64_MAX;
+
/* If it returns -1, then its a failure, if it returns +1 we need
have to understand that 'this' is subvolume of a xlator which,
will set the remote host and remote subvolume in a setxattr
@@ -3001,7 +3029,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_TIME,
.min = 0,
.max = 1013,
- .default_value = "42",
+ .default_value = "180",
.description = "Time duration for which the client waits to "
"check if the server is responsive."
},
diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c
index 1ad45394dd7..a1fe2e85267 100644
--- a/xlators/protocol/server/src/server-resolve.c
+++ b/xlators/protocol/server/src/server-resolve.c
@@ -11,6 +11,7 @@
#include "server.h"
#include "server-helpers.h"
#include "server-messages.h"
+#include "compat-errno.h"
int
@@ -58,6 +59,10 @@ resolve_gfid_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
resolve = state->resolve_now;
resolve_loc = &resolve->resolve_loc;
+ if (!state->loc.inode && inode) {
+ state->loc.inode = inode_ref (inode);
+ }
+
if (op_ret == -1) {
if (op_errno == ENOENT) {
gf_msg_debug (this->name, 0, "%s/%s: failed to resolve"
@@ -71,7 +76,9 @@ resolve_gfid_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_utoa (resolve_loc->pargfid),
resolve_loc->name, strerror (op_errno));
}
- goto out;
+ if (op_errno != ENODATA) {
+ goto out;
+ }
}
link_inode = inode_link (inode, resolve_loc->parent,
diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c
index d5410573ac3..ee8ce825098 100644
--- a/xlators/protocol/server/src/server-rpc-fops.c
+++ b/xlators/protocol/server/src/server-rpc-fops.c
@@ -33,6 +33,10 @@
void
forget_inode_if_no_dentry (inode_t *inode)
{
+ if (!inode) {
+ return;
+ }
+
if (!inode_has_dentry (inode))
inode_forget (inode, 0);
@@ -4644,7 +4648,7 @@ server3_3_unlink (rpcsvc_request_t *req)
goto out;
}
- state->resolve.type = RESOLVE_MUST;
+ state->resolve.type = RESOLVE_MAY;
state->resolve.bname = gf_strdup (args.bname);
memcpy (state->resolve.pargfid, args.pargfid, 16);
@@ -5642,7 +5646,7 @@ server3_3_rmdir (rpcsvc_request_t *req)
goto out;
}
- state->resolve.type = RESOLVE_MUST;
+ state->resolve.type = RESOLVE_MAY;
memcpy (state->resolve.pargfid, args.pargfid, 16);
state->resolve.bname = gf_strdup (args.bname);
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index d8ef5f7b73f..636108affbb 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ goto err;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL,
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
index d3f48f859bf..558755af009 100644
--- a/xlators/storage/posix/src/posix-handle.c
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -210,6 +210,12 @@ posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize,
goto out;
}
+ if (!inode && path) {
+ gf_log (this->name, GF_LOG_WARNING, "OOPS: Failed to resolve"
+ "path (%s), inode is null. Bailing!", path);
+ goto out;
+ }
+
ret = posix_make_ancestral_node (priv_base_path, path, pathsize, head,
dir_name, &iabuf, inode, type, xdata);
if (*parent != NULL) {
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 76e32a31594..4aa39514486 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -485,18 +485,21 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data,
"Failed to set dictionary value for %s",
key);
}
- } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) {
+ } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY) &&
+ filler->loc && filler->loc->inode &&
+ !gf_uuid_is_null (filler->loc->inode->gfid)) {
/* As of now, the only consumers of POSIX_ANCESTRY_PATH attempt
* fetching it via path-based fops. Hence, leaving it as it is
* for now.
*/
if (!filler->real_path)
goto out;
+
char *path = NULL;
ret = posix_get_ancestry (filler->this, filler->loc->inode,
NULL, &path, POSIX_ANCESTRY_PATH,
&filler->op_errno, xattr_req);
- if (ret < 0) {
+ if (ret < 0 || !path) {
goto out;
}
@@ -856,6 +859,7 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
int ret = 0;
ssize_t size = 0;
struct stat stat = {0, };
+ char *new_uuid = NULL;
if (!xattr_req)
@@ -864,12 +868,6 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
if (sys_lstat (path, &stat) != 0)
goto out;
- size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16);
- if (size == 16) {
- ret = 0;
- goto verify_handle;
- }
-
ret = dict_get_ptr (xattr_req, "gfid-req", &uuid_req);
if (ret) {
gf_msg_debug (this->name, 0,
@@ -878,7 +876,28 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
goto out;
}
- ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE);
+ size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16);
+ if (size == 16) {
+ if (!gf_uuid_compare (uuid_curr, uuid_req)) {
+ ret = 0;
+ goto verify_handle;
+ }
+
+ /* File has an existing GFID which differs from
+ * the requested one. This can occur when a subvolume
+ * has been offline while a file is deleted, and then
+ * comes back up but has not yet healed. Get rid of
+ * the old GFID link (handle_unset) and fall through
+ * to the set case below.
+ */
+ new_uuid = strdupa (uuid_utoa (uuid_req));
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: existing gfid %s overwritten with %s.",
+ path, uuid_utoa (uuid_curr), new_uuid);
+ posix_handle_unset (this, uuid_curr, NULL);
+ }
+
+ ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, 0);
if (ret == -1) {
gf_msg (this->name, GF_LOG_WARNING, errno, P_MSG_GFID_FAILED,
"setting GFID on %s failed ", path);
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index dfb7e05e49a..e56e71e8c27 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -183,8 +183,15 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless);
op_ret = -1;
if (gf_uuid_is_null (loc->pargfid) || (loc->name == NULL)) {
- /* nameless lookup */
- MAKE_INODE_HANDLE (real_path, this, loc, &buf);
+ if (gf_uuid_is_null (loc->gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "OOPS: Namless lookup with null gfid!");
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto out;
+ } else {
+ MAKE_INODE_HANDLE (real_path, this, loc, &buf);
+ }
} else {
MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf);
@@ -220,7 +227,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
}
if (priv->update_pgfid_nlinks) {
- if (!gf_uuid_is_null (loc->pargfid) && !IA_ISDIR (buf.ia_type)) {
+ if (!gf_uuid_is_null (loc->pargfid)) {
MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
PGFID_XATTR_KEY_PREFIX,
loc->pargfid);
@@ -691,6 +698,81 @@ out:
return 0;
}
+static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats,
+ double min_free_disk,
+ gf_boolean_t previously_ok)
+{
+ gf_boolean_t currently_ok;
+
+ if (min_free_disk < 100.0) {
+ double free_percent = 100.0 * stats->f_bavail / stats->f_blocks;
+
+ currently_ok =
+ free_percent >= min_free_disk ? _gf_true : _gf_false;
+ if (previously_ok && !currently_ok) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "min-free-disk limit exceeded: free percent "
+ "%f%% < %f%%. Writes disabled.",
+ free_percent, min_free_disk);
+ }
+ } else {
+ double free_bytes = (double) stats->f_bavail * stats->f_frsize;
+
+ currently_ok =
+ free_bytes >= min_free_disk ? _gf_true : _gf_false;
+ if (previously_ok && !currently_ok) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "min-free-disk limit exceeded: free bytes %f "
+ "< %f. Writes disabled.",
+ free_bytes, min_free_disk);
+ }
+ }
+
+ if (currently_ok && !previously_ok) {
+ gf_log (this->name, GF_LOG_INFO, "Free space has risen above "
+ "min-free-disk limit, writes "
+ "re-enabled.");
+ }
+
+ return currently_ok;
+}
+
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv)
+{
+ /* Check if there is sufficient free space to allow writes.
+ *
+ * This is called in the write path, so performance matters. We
+ * periodically sample free space by calling statvfs().
+ * freespace_check_lock is used to ensure only one process at a
+ * time makes the call; if the lock is contended, the previous
+ * status (reflected in freespace_check_passed) is used while
+ * the process that holds the mutex updates the current status.
+ */
+ if (!priv->freespace_check_interval) {
+ return _gf_true;
+ }
+
+ if (!pthread_mutex_trylock (&priv->freespace_check_lock)) {
+ struct timespec now;
+
+ clock_gettime (CLOCK_MONOTONIC, &now);
+ if (now.tv_sec >= priv->freespace_check_last.tv_sec +
+ priv->freespace_check_interval) {
+ sys_statvfs (priv->base_path, &priv->freespace_stats);
+ priv->freespace_check_last.tv_sec = now.tv_sec;
+
+ priv->freespace_check_passed = freespace_ok (
+ this, &priv->freespace_stats, priv->min_free_disk,
+ priv->freespace_check_passed);
+ }
+
+ pthread_mutex_unlock (&priv->freespace_check_lock);
+ }
+
+ return priv->freespace_check_passed;
+}
+
static int32_t
posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t flags, off_t offset, size_t len,
@@ -700,6 +782,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_errno = 0;
struct posix_fd *pfd = NULL;
gf_boolean_t locked = _gf_false;
+ struct posix_private *priv = this->private;
posix_inode_ctx_t *ctx = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -709,6 +792,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (priv, out);
+
+ if (!posix_write_ok (this, priv)) {
+ ret = -ENOSPC;
+ goto out;
+ }
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
@@ -2514,8 +2603,7 @@ posix_rename (call_frame_t *frame, xlator_t *this,
pthread_mutex_lock (&ctx_old->pgfid_lock);
{
- if (!IA_ISDIR (oldloc->inode->ia_type)
- && priv->update_pgfid_nlinks) {
+ if (priv->update_pgfid_nlinks) {
MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
PGFID_XATTR_KEY_PREFIX,
oldloc->pargfid);
@@ -2581,8 +2669,7 @@ posix_rename (call_frame_t *frame, xlator_t *this,
P_MSG_SET_XDATA_FAIL, "failed to set "
GET_LINK_COUNT" for %s", real_newpath);
- if (!IA_ISDIR (oldloc->inode->ia_type)
- && priv->update_pgfid_nlinks) {
+ if (priv->update_pgfid_nlinks) {
MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
PGFID_XATTR_KEY_PREFIX,
newloc->pargfid);
@@ -3386,6 +3473,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (priv, out);
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ op_ret = -1;
+ goto out;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL,
@@ -4335,6 +4428,12 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode,
op_errno, xdata);
}
+ if (ret == 0 && path && !*path) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Failed to resolve ancestry path, pgfid "
+ "attribute isn't set (yet).");
+ ret = -1;
+ }
out:
if (ret && path && *path) {
GF_FREE (*path);
@@ -4555,7 +4654,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
goto done;
}
- if (loc->inode && name
+ if (loc->inode && !gf_uuid_is_null(loc->inode->gfid) && name
&& (strcmp (name, GET_ANCESTRY_PATH_KEY) == 0)) {
int type = POSIX_ANCESTRY_PATH;
@@ -6761,6 +6860,16 @@ struct posix_private *priv = NULL;
options, uint32, out);
posix_spawn_health_check_thread (this);
+ pthread_mutex_lock (&priv->freespace_check_lock);
+ {
+ GF_OPTION_RECONF ("freespace-check-interval",
+ priv->freespace_check_interval,
+ options, uint32, out);
+ GF_OPTION_RECONF ("min-free-disk", priv->min_free_disk, options,
+ percent_or_size, out);
+ }
+ pthread_mutex_unlock (&priv->freespace_check_lock);
+
ret = 0;
out:
return ret;
@@ -7375,6 +7484,19 @@ init (xlator_t *this)
GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
uint32, out);
+
+ GF_OPTION_INIT ("freespace-check-interval",
+ _private->freespace_check_interval, uint32, out);
+
+ GF_OPTION_INIT ("min-free-disk", _private->min_free_disk,
+ percent_or_size, out);
+
+ pthread_mutex_init (&_private->freespace_check_lock, NULL);
+ sys_statvfs (_private->base_path, &_private->freespace_stats);
+ clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last);
+ _private->freespace_check_passed = freespace_ok (
+ this, &_private->freespace_stats, _private->min_free_disk,
+ _gf_true);
out:
return ret;
}
@@ -7539,7 +7661,7 @@ struct volume_options options[] = {
},
{ .key = {"update-link-count-parent"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
+ .default_value = "on",
.description = "Enable placeholders for gfid to path conversion"
},
#if GF_DARWIN_HOST_OS
@@ -7552,5 +7674,22 @@ struct volume_options options[] = {
"\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n"
},
#endif
+ { .key = {"min-free-disk"},
+ .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+ .default_value = "2%",
+ .description = "Minimum percentage/size of disk space, after which we"
+ "start failing writes with ENOSPC."
+ },
+ { .key = {"freespace-check-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "5",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Interval in seconds between freespace measurements "
+ "used for the min-free-disk determination. "
+ "Set to 0 to disable."
+ },
+
{ .key = {NULL} }
};
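The posix_write_ok() helper added above gates writev, fallocate and the AIO write path on the new min-free-disk and freespace-check-interval options: statvfs() is sampled at most once per interval, and any thread that fails the trylock simply reuses the previous verdict. The standalone sketch below shows the same trylock-throttled pattern using plain POSIX calls instead of the gluster sys_*/gf_* wrappers; the struct and function names are illustrative only, and only the percentage form of the threshold is handled:

#include <pthread.h>
#include <stdbool.h>
#include <sys/statvfs.h>
#include <time.h>

struct fs_check {
        pthread_mutex_t lock;          /* guards last and passed */
        time_t          last;          /* time of the previous sample */
        time_t          interval;      /* seconds between samples; 0 = off */
        double          min_free_pct;  /* e.g. 2.0 for "2%" */
        bool            passed;        /* verdict from the last sample */
};

bool
write_ok (struct fs_check *c, const char *path)
{
        if (c->interval == 0)
                return true;                    /* checking disabled */

        /* Only one thread refreshes; the rest reuse the last verdict. */
        if (pthread_mutex_trylock (&c->lock) == 0) {
                struct timespec now;

                clock_gettime (CLOCK_MONOTONIC, &now);
                if (now.tv_sec >= c->last + c->interval) {
                        struct statvfs st;

                        if (statvfs (path, &st) == 0 && st.f_blocks > 0) {
                                double pct = 100.0 * st.f_bavail / st.f_blocks;

                                c->passed = (pct >= c->min_free_pct);
                        }
                        c->last = now.tv_sec;
                }
                pthread_mutex_unlock (&c->lock);
        }

        return c->passed;
}

In a hand-written volfile the corresponding knobs would sit under the storage/posix volume as option min-free-disk 2% and option freespace-check-interval 5, matching the defaults declared in the option table above.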
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index febd4326aa1..a2e1201dd72 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -174,7 +174,14 @@ struct posix_private {
XATTR_BOTH,
} xattr_user_namespace;
#endif
-
+ /* freespace_check_lock protects access to following three fields. */
+ pthread_mutex_t freespace_check_lock;
+ struct timespec freespace_check_last;
+ struct statvfs freespace_stats;
+ double min_free_disk;
+ /* mutex protection ends. */
+ uint32_t freespace_check_interval;
+ gf_boolean_t freespace_check_passed;
};
typedef struct {
@@ -280,6 +287,9 @@ posix_handle_georep_xattrs (call_frame_t *, const char *, int *, gf_boolean_t);
void
posix_gfid_unset (xlator_t *this, dict_t *xdata);
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv);
+
int
posix_pacl_set (const char *path, const char *key, const char *acl_s);