diff options
177 files changed, 9731 insertions, 1136 deletions
diff --git a/Makefile.am b/Makefile.am index d36f53055ea..c6f5618b541 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,3 +1,4 @@ +SOURCES = site.h EXTRA_DIST = autogen.sh \ COPYING-GPLV2 COPYING-LGPLV3 \ INSTALL README.md AUTHORS THANKS NEWS \ diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c index 8c9872cfa53..5d08114c8c5 100644 --- a/api/src/glfs-mgmt.c +++ b/api/src/glfs-mgmt.c @@ -911,7 +911,8 @@ glfs_mgmt_init (struct glfs *fs) if (!strcmp (cmd_args->volfile_server_transport, "unix")) { ret = rpc_transport_unix_options_build (&options, host, 0); } else { - ret = rpc_transport_inet_options_build (&options, host, port); + ret = rpc_transport_inet_options_build (&options, host, port, + NULL); } if (ret) diff --git a/build.sh b/build.sh new file mode 100755 index 00000000000..2eb5ae75424 --- /dev/null +++ b/build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +PACKAGES="automake libtool gperftools-devel gperftools-debuginfo gperftools-libs \ + glib2-devel jemalloc jemalloc-devel fb-gcc flex bison openssl-devel libxml2-devel\ + libacl-devel userspace-rcu-devel lvm2 python-devel" + +if [ $(/usr/lib/rpm/redhat/dist.sh --distnum) -eq "7" ]; then + PACKAGES="$PACKAGES libtirpc libtirpc-devel-0.2.4 devtoolset-4-binutils devtoolset-4-gcc devtoolset-4-runtime" +elif [ $(/usr/lib/rpm/redhat/dist.sh --distnum) -eq "6" ]; then + PACKAGES="$PACKAGES libfbtirpc libfbtirpc-devel libgssglue libgssglue-devel devtoolset-2-binutils devtoolset-2-gcc devtoolset-2-runtime" +else + echo "Centos $(/usr/lib/rpm/redhat/dist.sh --distnum) is not currently supported" + exit 1 +fi + +# Skip this for Jekins automated builds (they have these packages already) +# as the sudo will cause the build to fail +[ $USER == "svcscm" ] || sudo yum install $PACKAGES -y + +source ./build_env + +./autogen.sh || exit 1 +./configure $GF_CONF_OPTS +make -j || exit 1 diff --git a/build_env b/build_env new file mode 100644 index 00000000000..74f7c0256e1 --- /dev/null +++ b/build_env @@ -0,0 +1,55 @@ +#!/bin/bash + +# 
+# Note: The GF_CONF_OPTS (configure) options will only be used for dev or +# test builds. For RPM builds the configure options are defined in the +# glusterfs.spec.in file. ASAN is not enabled by default for RPMs as it degrades +# performance. In specific instances it can be enabled simply by appending +# the --with-asan option in the %build step of the spec file. +# + +GF_CONF_OPTS="--localstatedir=/var --sysconfdir /var/lib --prefix /usr --libdir /usr/lib64 \ + --enable-fusermount --enable-api --with-jemalloc \ + --with-ipv6-default --with-fbextras --disable-tiering" + +if [ -x /usr/lib/rpm/redhat/dist.sh ]; then + REDHAT_MAJOR=$(/usr/lib/rpm/redhat/dist.sh --distnum) +else + REDHAT_MAJOR=0 +fi + +# Enable systemd support on CentOS >= 7 +if [ $REDHAT_MAJOR -ge 7 ]; then + GF_CONF_OPTS="$GF_CONF_OPTS --with-systemd" +fi +export GF_CONF_OPTS + +ASAN_ENABLED=0 +# Check if ASAN is enabled +if [ "$ASAN_ENABLED" -eq "1" ]; then + GF_CONF_OPTS="$GF_CONF_OPTS --with-asan" +fi + +if [ $REDHAT_MAJOR -eq "7" ]; then + GCC_BIN="/opt/rh/devtoolset-4/root/usr/bin/gcc" + GCC_LIB="/opt/rh/devtoolset-4/root/lib64" + DESTDIR='/' # pycompile is finicky in centos7 if --destdir is passed nothing. +elif [ $REDHAT_MAJOR -eq "6" ]; then + ENGSHARE_GCC_PATH="/mnt/vol/engshare/third-party2/gcc" + GCC_BIN="$ENGSHARE_GCC_PATH/4.9.x/centos6-native/108cf83/bin/gcc" + GCC_LIB="$ENGSHARE_GCC_PATH/4.9.x/centos6-native/108cf83/lib64" +else + echo "Centos $REDHAT_MAJOR is not currently supported" + exit 1 +fi + +export LIB_DIR="$GCC_LIB" +export CC="$GCC_BIN" + +#export CC="/mnt/vol/engshare/third-party2/gcc/4.9.x/centos6-native/108cf83/bin/gcc" + +# If you think this should all be done in configure.ac you'd be 100% +# correct; aside from the fact that it simply doesn't work when done there :). +# You'll find the debug symbols are not present in resultant binaries nor is +# the code un-optimized. 
+export CFLAGS="-O0 -ggdb -fPIC -Wall -Werror -L${LIB_DIR}" diff --git a/cli/src/cli.c b/cli/src/cli.c index 2ecaae415d6..58fd9104f81 100644 --- a/cli/src/cli.c +++ b/cli/src/cli.c @@ -586,6 +586,11 @@ cli_rpc_init (struct cli_state *state) int ret = -1; int port = CLI_GLUSTERD_PORT; xlator_t *this = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif this = THIS; cli_rpc_prog = &cli_prog; @@ -621,7 +626,8 @@ cli_rpc_init (struct cli_state *state) goto out; ret = dict_set_str (options, "transport.address-family", - "inet"); + addr_family); + if (ret) goto out; } @@ -706,7 +712,7 @@ main (int argc, char *argv[]) if (ret) goto out; - cli_default_conn_timeout = 120; + cli_default_conn_timeout = 600; cli_ten_minutes_timeout = 600; ret = cli_state_init (&state); diff --git a/configure.ac b/configure.ac index 1ab3c996d3c..97eb137c752 100644 --- a/configure.ac +++ b/configure.ac @@ -33,7 +33,7 @@ if libtool --help 2>&1 | grep -q quiet; then AM_LIBTOOLFLAGS="--quiet"; fi -AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_HEADERS([config.h site.h]) AC_CONFIG_FILES([Makefile libglusterfs/Makefile @@ -72,6 +72,8 @@ AC_CONFIG_FILES([Makefile xlators/cluster/Makefile xlators/cluster/afr/Makefile xlators/cluster/afr/src/Makefile + xlators/cluster/aha/Makefile + xlators/cluster/aha/src/Makefile xlators/cluster/stripe/Makefile xlators/cluster/stripe/src/Makefile xlators/cluster/dht/Makefile @@ -275,7 +277,19 @@ if test "x$enable_debug" = "xyes"; then CFLAGS="${CFLAGS} -g -O0 -DDEBUG" else BUILD_DEBUG=no - CFLAGS="${CFLAGS} -g -O2" + CFLAGS="${CFLAGS} -g" +fi + +AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.])) +if test "x$with_fbextras" = "xyes"; then + BUILD_FBEXTRAS=yes +else + BUILD_FBEXTRAS=no +fi + +AC_ARG_ENABLE([privport_prefer], AC_HELP_STRING([--disable-privport_prefer], [Disable preferred usage of privleged ports.])) +if test "x$enable_privport_prefer" = "xno"; then + CFLAGS="${CFLAGS} 
-DNO_PRIVPORT" fi case $host_os in @@ -349,6 +363,10 @@ AC_ARG_WITH([ocf], ) AC_SUBST(OCF_SUBDIR) +AC_ARG_WITH(asan,--with-asan,,with_asan="no") +AC_ARG_WITH(tsan,--with-tsan,,with_tsan="no") +AC_ARG_WITH(jemalloc,--with-jemalloc,,with_jemalloc="no") + # LEX needs a check AC_PROG_LEX if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then @@ -908,6 +926,71 @@ AC_SUBST(GF_DISTRIBUTION) GF_HOST_OS="" GF_LDFLAGS="-rdynamic" +BUILD_ASAN=no +if test "x$with_asan" = "xyes"; then + echo -n "checking for address sanitizer (ASAN) support... " + AC_LANG_CONFTEST([AC_LANG_PROGRAM()]) + $CC conftest.c $CFLAGS -fsanitize=address -o conftest + ret=$? + rm -f conftest.o conftest + if test $ret -eq 0 ; then + echo "yes" + BUILD_ASAN=yes + GF_CFLAGS="$GF_CFLAGS -DASAN -fsanitize=address -O0 -ggdb" + GF_LDFLAGS="-gdb -static-libasan $GF_LDFLAGS" + else + echo "no" + echo "ERROR: ASAN not supported by compiler ($CC)" + exit 1 + fi +fi + +BUILD_TSAN=no +if test "x$with_tsan" = "xyes"; then + echo -n "checking for thread sanitizer (TSAN) support... " + AC_LANG_CONFTEST([AC_LANG_PROGRAM()]) + $CC conftest.c $CFLAGS -fsanitize=thread -o conftest > /dev/null 2> /dev/null + ret=$? + rm -f conftest.o conftest + if test $ret -eq 0 ; then + echo "yes" + BUILD_TSAN=yes + GF_CFLAGS="$GF_CFLAGS -fsanitize=thread -O0 -ggdb -fPIC -pie" + GF_LDFLAGS="-gdb -static-libtsan $GF_LDFLAGS" + else + echo "no" + echo "ERROR: TSAN not supported by compiler ($CC)" + exit 1 + fi +fi + +BUILD_JEMALLOC=no +if test "x$with_jemalloc" = "xyes"; then + echo -n "checking for jemalloc support... " + AC_LANG_CONFTEST([AC_LANG_PROGRAM()]) + $CC conftest.c $CFLAGS -ljemalloc -o conftest > /dev/null 2> /dev/null + ret=$? 
+ rm -f conftest.o conftest + if test $ret -eq 0 ; then + echo "yes" + BUILD_JEMALLOC=yes + GF_LDFLAGS="-ljemalloc $GF_LDFLAGS" + else + echo "no" + echo "ERROR: jemalloc linking error" + exit 1 + fi +fi + +TESTER_CFLAGS="" +dnl include tirpc for FB builds +if test "x$BUILD_FBEXTRAS" = "xyes"; then + TIRPC_CFLAGS="-I/usr/include/tirpc" + GF_LDFLAGS="-ltirpc $GF_LDFLAGS" + GF_CFLAGS="$TIRPC_CFLAGS $GF_CFLAGS -DIPV6_DEFAULT -DGF_FBEXTRAS" + TESTER_CFLAGS="$TESTER_CFLAGS -ltirpc" +fi + dnl check for gcc -Werror=format-security saved_CFLAGS=$CFLAGS CFLAGS="-Wformat -Werror=format-security" @@ -1099,6 +1182,12 @@ AC_ARG_ENABLE([debug], AC_HELP_STRING([--enable-debug], [Enable debug build options.])) +AC_ARG_ENABLE([mempool], + AC_HELP_STRING([--disable-mempool], + [Disable the Gluster memory pooler.])) +if test "x$enable_mempool" = "xno"; then + CFLAGS="${CFLAGS} -DDISABLE_MEMPOOL" +fi # syslog section AC_ARG_ENABLE([syslog], @@ -1287,19 +1376,21 @@ CONTRIBDIR='$(top_srcdir)/contrib' AC_SUBST(CONTRIBDIR) GF_CPPDEFINES='-D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS)' -GF_CPPINCLUDES='-include $(top_builddir)/config.h -I$(top_srcdir)/libglusterfs/src -I$(top_builddir)/libglusterfs/src' +GF_CPPINCLUDES='-include $(top_builddir)/config.h -include $(top_builddir)/site.h -I$(top_srcdir)/libglusterfs/src -I$(top_builddir)/libglusterfs/src' GF_CPPFLAGS="$GF_CPPFLAGS $GF_CPPDEFINES $GF_CPPINCLUDES" AC_SUBST([GF_CPPFLAGS]) AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS") AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS") AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS") +AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes") AC_SUBST(GLUSTERD_WORKDIR) AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! 
-d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd ) AC_SUBST(GLUSTERD_VOLFILE) AC_SUBST(GLUSTERFS_LIBEXECDIR) AC_SUBST(GLUSTERFSD_MISCDIR) +AC_SUBST(TESTER_CFLAGS) dnl pkg-config versioning dnl @@ -1361,4 +1452,7 @@ echo "POSIX ACLs : $BUILD_POSIX_ACLS" echo "Data Classification : $BUILD_GFDB" echo "firewalld-config : $BUILD_FIREWALLD" echo "Experimental xlators : $BUILD_EXPERIMENTAL" +echo "ASAN enabled : $BUILD_ASAN" +echo "TSAN enabled : $BUILD_TSAN" +echo "jemalloc enabled : $BUILD_JEMALLOC" echo diff --git a/fb-smoke.sh b/fb-smoke.sh new file mode 100755 index 00000000000..a68b9414cd2 --- /dev/null +++ b/fb-smoke.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# +# Tests +# +source ./test_env + +# +# Helpers +# +function elapsed_since { + start=$1 + echo $((SECONDS - start)) +} + +function flaky { + local f + for f in ${KNOWN_FLAKY_TESTS}; do + if [ "$f" == "$1" ]; then + return 0 + fi + done + return 1 +} + +function outfile { + printf "/tmp/%s.out" "$(echo "$f" | tr / -)" +} + +function exit_test { + if [ "$STOP_ON_FAIL" -eq "1" ]; then + print_result + exit "$1" + fi +} + +function test { + f=$1 + printf "%s" "$f" + local start + start=$SECONDS + local out + out=$(outfile "$f") + + for i in $(seq 1 "$ATTEMPT"); do + DEBUG=1 timeout --foreground "$TEST_TIMEOUT" prove -v "$f" &> "$out.$i" + local rc=$? + if [ "$rc" -eq "0" ]; then + SUCCESS=$SUCCESS+1 + printf " PASS (%s s)\n" "$(elapsed_since $start)" + rm -f "$out.$i" + return 0 + else + printf " %s" "($i/$ATTEMPT)" + fi + done + + if [[ $rc -eq 124 || $rc 
-eq 137 ]]; then + FAILED_TESTS+=($f) + FAIL=$FAIL+1 + printf " TIMEOUT (%s s)\n" "$(elapsed_since $start)" + exit_test 1 + else + FAILED_TESTS+=($f) + FAIL=$FAIL+1 + printf " FAIL (%s s)\n" "$(elapsed_since $start)" + exit_test 1 + fi +} + +function flakytest { + f=$1 + + if [ "$SKIP_FLAKY" -eq "1" ]; then + SKIP=$SKIP+1 + else + printf "<flaky> " + test "$f" + fi +} + +function print_result { + echo + echo "== RESULTS ==" + echo "TESTS : $TOTAL" + echo "SUCCESS : $SUCCESS" + echo "FAIL : $FAIL" + echo "SKIP : $SKIP" + + if [ "$FAIL" -gt "0" ]; then + echo + echo "== FAILED TESTS ==" + echo "${FAILED_TESTS[@]}" + echo + echo "== LOGS ==" + ls /tmp/*.out.* + echo + echo "== END ==" + fi +} + +function run_remote { + if [ ! -d "$FBCODE" ]; then + echo "fbcode does not exists. Please checkout fbcode" + return 1 + fi + + local flags='' + if [ "$VERBOSE" -eq "1" ]; then + flags="$flags -v" + fi + + if [ "$VALGRIND" -eq "1" ]; then + flags="$flags --valgrind" + fi + + if [ "$ASAN" -eq "1" ]; then + flags="$flags --asan" + fi + + "$FBCODE/storage/gluster/gluster-build/fb-gluster-test.py" $flags --tester \ + --n "$N" --hosts "$REMOTE_HOSTS" --tests "$REMOTE_TESTS"\ + --flaky_tests "$REMOTE_FLAKY_TESTS" +} + +# +# Main +# +declare -i TOTAL=0 +declare -i SUCCESS=0 +declare -i FAIL=0 +declare -i SKIP=0 +declare -a FAILED_TESTS + +TEST_TIMEOUT=${TEST_TIMEOUT:=300} +SKIP_FLAKY=${SKIP_FLAKY:=1} +STOP_ON_FAIL=${STOP_ON_FAIL:=0} +FBCODE=${FBCODE:="$HOME/fbsource/fbcode"} +N=${N:=0} +REMOTE_HOSTS=${REMOTE_HOSTS:="$(smcc ls-hosts -s gluster.build.ash | xargs)"} +REMOTE=${REMOTE:=0} +REMOTE_TESTS=${REMOTE_TESTS:=$DESIRED_TESTS} +REMOTE_FLAKY_TESTS=${REMOTE_FLAKY_TESTS:=$KNOWN_FLAKY_TESTS} +VERBOSE=${VERBOSE:=0} +VALGRIND=${VALGRIND:=0} +ASAN=${ASAN:=0} + +if [ "$REMOTE" -eq "1" ]; then + run_remote + exit $? 
+fi + +if [ "$SKIP_FLAKY" -eq "0" ]; then + ATTEMPT=${ATTEMPT:=3} +else + ATTEMPT=${ATTEMPT:=1} +fi + +echo "== SETTINGS ==" +echo "TEST_TIMEOUT = $TEST_TIMEOUT s" +echo "SKIP_FLAKY = $SKIP_FLAKY" +echo "STOP_ON_FAIL = $STOP_ON_FAIL" +echo "ATTEMPT = $ATTEMPT" +echo "REMOTE = $REMOTE" +echo "FBCODE = $FBCODE" +echo + +# try cleaning up the environment +rm -f /tmp/*.out.* || true + +# sanity check +if ! cmp -s ./glusterfsd/src/.libs/glusterfsd $(which glusterfsd) +then + echo "Installed gluster does not match local, perhaps you ought make install?" + exit 1 +fi + +echo "== TESTS ==" +for f in ${DESIRED_TESTS} +do + TOTAL=$TOTAL+1 + if flaky "$f"; then + flakytest "$f" + else + test "$f" + fi +done + +print_result +exit $FAIL diff --git a/glusterfs.spec.in b/glusterfs.spec.in index 66c9a46a2be..6c30a955977 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -13,6 +13,10 @@ # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with debug %{?_with_debug:%global _with_debug --enable-debug} +# if you wish to compile an rpm with Facebook specfic extras... +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras +%{?_with_fbextras:%global _with_fbextras --with-fbextras} + # if you wish to compile an rpm with cmocka unit testing... 
# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka %{?_with_cmocka:%global _with_cmocka --enable-cmocka} @@ -196,6 +200,10 @@ BuildRequires: libxml2-devel openssl-devel BuildRequires: libaio-devel libacl-devel BuildRequires: python-devel BuildRequires: python-ctypes +%if ( 0%{?_with_fbextras:1} ) +BuildRequires: fb-libtirpc fb-libtirpc-devel +BuildRequires: jemalloc jemalloc-devel +%endif BuildRequires: userspace-rcu-devel >= 0.7 %if ( 0%{?rhel} && 0%{?rhel} <= 6 ) BuildRequires: automake @@ -513,6 +521,10 @@ Requires: %{name}-cli%{?_isa} = %{version}-%{release} Requires: %{name}-libs%{?_isa} = %{version}-%{release} # some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse Requires: %{name}-fuse%{?_isa} = %{version}-%{release} +%if ( 0%{?_with_fbextras:1} ) +Requires: fb-libtirpc >= 0.2.5-1 +Requires: jemalloc >= 3.6.0-1 +%endif # self-heal daemon, rebalance, nfs-server etc. are actually clients Requires: %{name}-api%{?_isa} = %{version}-%{release} Requires: %{name}-client-xlators%{?_isa} = %{version}-%{release} @@ -600,7 +612,8 @@ export CFLAGS %{?_without_ocf} \ %{?_without_rdma} \ %{?_without_syslog} \ - %{?_without_tiering} + %{?_without_tiering} \ + %{?_with_fbextras} # fix hardening and remove rpath in shlibs %if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) @@ -807,6 +820,12 @@ fi %firewalld_reload %endif +%if ( 0%{?_with_fbextras:1} ) +if ! [ -f %{_sharedstatedir}/glusterd/glusterd.info ]; then + echo "UUID=$(/usr/bin/uuidgen)" >> %{_sharedstatedir}/glusterd/glusterd.info +fi +%endif + pidof -c -o %PPID -x glusterd &> /dev/null if [ $? 
-eq 0 ]; then kill -9 `pgrep -f gsyncd.py` &> /dev/null diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index c47fa3883c9..a7c96d1e7a0 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -1903,9 +1903,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, switch (event) { case RPC_CLNT_DISCONNECT: - GF_LOG_OCCASIONALLY (log_ctr1, "glusterfsd-mgmt", GF_LOG_ERROR, - "failed to connect with remote-host: %s (%s)", - ctx->cmd_args.volfile_server, strerror (errno)); + ctx->cmd_args.connect_attempts++; + + gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, + "Connect attempt with remote-host: %s (%s) (%u/%d)", + ctx->cmd_args.volfile_server, + strerror (errno), + ctx->cmd_args.connect_attempts, + ctx->cmd_args.max_connect_attempts); if (!rpc->disabled) { /* * Check if dnscache is exhausted for current server @@ -1916,8 +1921,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, break; } } + + /* If we run out of servers, AND we attempted to connect + * max connect times, then we should return ENOTCONN + */ server = ctx->cmd_args.curr_server; - if (server->list.next == &ctx->cmd_args.volfile_servers) { + if ((ctx->cmd_args.connect_attempts >= + ctx->cmd_args.max_connect_attempts) && + server->list.next == &ctx->cmd_args.volfile_servers) { if (!ctx->active) need_term = 1; emval = ENOTCONN; @@ -1926,24 +1937,33 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, "Exhausted all volfile servers"); break; } - server = list_entry (server->list.next, typeof(*server), list); - ctx->cmd_args.curr_server = server; - ctx->cmd_args.volfile_server = server->volfile_server; - - ret = dict_set_str (rpc_trans->options, "remote-host", - server->volfile_server); - if (ret != 0) { - gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, - "failed to set remote-host: %s", + + /* If we exceed the # of connect attempts, we should + * move onto the next server + */ + 
if (ctx->cmd_args.connect_attempts >= + ctx->cmd_args.max_connect_attempts || !server) { + server = list_entry (server->list.next, + typeof(*server), list); + ctx->cmd_args.curr_server = server; + ctx->cmd_args.volfile_server = server->volfile_server; + + ret = dict_set_str (rpc_trans->options, "remote-host", + server->volfile_server); + if (ret != 0) { + gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, + "failed to set remote-host: %s", + server->volfile_server); + if (!ctx->active) + need_term = 1; + emval = ENOTCONN; + break; + } + ctx->cmd_args.connect_attempts = 0; + gf_log ("glusterfsd-mgmt", GF_LOG_INFO, + "connecting to next volfile server %s", server->volfile_server); - if (!ctx->active) - need_term = 1; - emval = ENOTCONN; - break; } - gf_log ("glusterfsd-mgmt", GF_LOG_INFO, - "connecting to next volfile server %s", - server->volfile_server); break; case RPC_CLNT_CONNECT: rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn); @@ -1960,7 +1980,7 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, } } - + ctx->cmd_args.connect_attempts = 0; if (is_mgmt_rpc_reconnect) glusterfs_mgmt_pmap_signin (ctx); @@ -2120,6 +2140,7 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx) int ret = -1; int port = GF_DEFAULT_BASE_PORT; char *host = NULL; + char *addr_family = NULL; cmd_args = &ctx->cmd_args; GF_VALIDATE_OR_GOTO (THIS->name, cmd_args->volfile_server, out); @@ -2136,7 +2157,19 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx) !strcmp (cmd_args->volfile_server_transport, "unix")) { ret = rpc_transport_unix_options_build (&options, host, 0); } else { - ret = rpc_transport_inet_options_build (&options, host, port); + xlator_cmdline_option_t *cmd_option = NULL; + + list_for_each_entry (cmd_option, + &cmd_args->xlator_options, cmd_args) { + if (!strcmp(cmd_option->key, + "transport.address-family")) { + addr_family = cmd_option->value; + break; + } + } + + ret = rpc_transport_inet_options_build (&options, host, port, + addr_family); } if (ret) goto 
out; diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c index 6c7a7c883fa..5022cfc22da 100644 --- a/glusterfsd/src/glusterfsd.c +++ b/glusterfsd/src/glusterfsd.c @@ -986,7 +986,7 @@ parse_opts (int key, char *arg, struct argp_state *state) cmd_args->debug_mode = ENABLE_DEBUG_MODE; break; case ARGP_VOLFILE_MAX_FETCH_ATTEMPTS: - cmd_args->max_connect_attempts = 1; + cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS; break; case ARGP_DIRECT_IO_MODE_KEY: @@ -1955,13 +1955,7 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx) } } - /* - This option was made obsolete but parsing it for backward - compatibility with third party applications - */ - if (cmd_args->max_connect_attempts) { - gf_msg ("glusterfs", GF_LOG_WARNING, 0, glusterfsd_msg_33); - } + cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS; #ifdef GF_DARWIN_HOST_OS if (cmd_args->mount_point) diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h index e442bede5db..b5c6b27b534 100644 --- a/glusterfsd/src/glusterfsd.h +++ b/glusterfsd/src/glusterfsd.h @@ -16,7 +16,7 @@ #define DEFAULT_GLUSTERD_VOLFILE CONFDIR "/glusterd.vol" #define DEFAULT_CLIENT_VOLFILE CONFDIR "/glusterfs.vol" #define DEFAULT_SERVER_VOLFILE CONFDIR "/glusterfsd.vol" - +#define DEFAULT_MAX_CONNECT_ATTEMPTS 200 #define DEFAULT_EVENT_POOL_SIZE 16384 #define ARGP_LOG_LEVEL_NONE_OPTION "NONE" diff --git a/libglusterfs/src/client_t.c b/libglusterfs/src/client_t.c index 3e0e5936ae2..b3eb4e4df8c 100644 --- a/libglusterfs/src/client_t.c +++ b/libglusterfs/src/client_t.c @@ -366,6 +366,8 @@ client_destroy (client_t *client) } } GF_FREE (client->auth.data); + GF_FREE (client->auth.username); + GF_FREE (client->auth.passwd); GF_FREE (client->scratch_ctx.ctx); GF_FREE (client->client_uid); GF_FREE (client); diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index d7cd0ad015d..e533992556b 100644 --- a/libglusterfs/src/common-utils.c +++ 
b/libglusterfs/src/common-utils.c @@ -181,26 +181,16 @@ gf_rev_dns_lookup (const char *ip) { char *fqdn = NULL; int ret = 0; - struct sockaddr_in sa = {0}; - char host_addr[256] = {0, }; GF_VALIDATE_OR_GOTO ("resolver", ip, out); - sa.sin_family = AF_INET; - inet_pton (AF_INET, ip, &sa.sin_addr); - ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr, - sizeof (host_addr), NULL, 0, 0); - + /* Get the FQDN */ + ret = gf_get_hostname_from_ip ((char *)ip, &fqdn); if (ret != 0) { gf_msg ("resolver", GF_LOG_INFO, errno, LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve " "hostname for %s", ip); - goto out; } - - /* Get the FQDN */ - fqdn = gf_strdup (host_addr); - out: return fqdn; } @@ -3107,11 +3097,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) char *client_ip_copy = NULL; char *tmp = NULL; char *ip = NULL; + size_t addr_sz = 0; /* if ipv4, reverse lookup the hostname to * allow FQDN based rpc authentication */ - if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) { + if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) && + !valid_ipv4_address (client_ip, strlen (client_ip), 0)) { /* most times, we get a.b.c.d:port form, so check that */ client_ip_copy = gf_strdup (client_ip); if (!client_ip_copy) @@ -3124,12 +3116,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *)&client_sock_in; + addr_sz = sizeof (client_sock_in); client_sock_in.sin_family = AF_INET; ret = inet_pton (AF_INET, ip, (void *)&client_sock_in.sin_addr.s_addr); } else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *) &client_sock_in6; + addr_sz = sizeof (client_sock_in6); client_sock_in6.sin6_family = AF_INET6; ret = inet_pton (AF_INET6, ip, @@ -3143,8 +3137,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) goto out; } + /* You cannot just use sizeof (*client_sockaddr), as per 
the man page + * the (getnameinfo) size must be the size of the underlying sockaddr + * struct e.g. sockaddr_in6 or sockaddr_in. Failure to do so will + * break IPv6 hostname resolution (IPv4 will work only because + * the sockaddr_in struct happens to be of the correct size). + */ ret = getnameinfo (client_sockaddr, - sizeof (*client_sockaddr), + addr_sz, client_hostname, sizeof (client_hostname), NULL, 0, 0); if (ret) { diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index 51264237ab4..5e338f69528 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -642,7 +642,7 @@ gf_time_fmt (char *dst, size_t sz_dst, time_t utime, unsigned int fmt) if (timefmt_last == (gf_timefmts) - 1) _gf_timestuff (&timefmt_last, &fmts, &zeros); if (timefmt_last < fmt) fmt = gf_timefmt_default; - if (utime && gmtime_r (&utime, &tm) != NULL) { + if (utime && localtime_r (&utime, &tm) != NULL) { strftime (dst, sz_dst, fmts[fmt], &tm); } else { strncpy (dst, "N/A", sz_dst); diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h index fbaac76b9ee..771ed983d32 100644 --- a/libglusterfs/src/compat.h +++ b/libglusterfs/src/compat.h @@ -479,6 +479,12 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0); #endif +#ifdef GF_BSD_HOST_OS +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME +#endif + +#ifndef IPV6_DEFAULT + #ifndef IXDR_GET_LONG #define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf)) #endif @@ -495,6 +501,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG(buf, (long)(v)) #endif +#endif /* IPV6_DEFAULT */ + #if defined(__GNUC__) && !defined(RELAX_POISONING) /* Use run API, see run.h */ #include <stdlib.h> /* system(), mkostemp() */ diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 25ddff0d8c4..6a61e641e19 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -27,6 +27,45 
@@ #include "statedump.h" #include "libglusterfs-messages.h" +/* this goes with the bucket_size lookup table below */ +#define NUM_DISTINCT_SIZES_32_BIT 32 + +/* this bucket_size lookup table is borrowed from GNU libstdc++ */ +static const uint32_t bucket_sizes[NUM_DISTINCT_SIZES_32_BIT] = { + /* 0 */ 5ul, + /* 1 */ 11ul, + /* 2 */ 23ul, + /* 3 */ 47ul, + /* 4 */ 97ul, + /* 5 */ 199ul, + /* 6 */ 409ul, + /* 7 */ 823ul, + /* 8 */ 1741ul, + /* 9 */ 3469ul, + /* 10 */ 6949ul, + /* 11 */ 14033ul, + /* 12 */ 28411ul, + /* 13 */ 57557ul, + /* 14 */ 116731ul, + /* 15 */ 236897ul, + /* 16 */ 480881ul, + /* 17 */ 976369ul, + /* 18 */ 1982627ul, + /* 19 */ 4026031ul, + /* 20 */ 8175383ul, + /* 21 */ 16601593ul, + /* 22 */ 33712729ul, + /* 23 */ 68460391ul, + /* 24 */ 139022417ul, + /* 25 */ 282312799ul, + /* 26 */ 573292817ul, + /* 27 */ 1164186217ul, + /* 28 */ 2364114217ul, + /* 29 */ 4294967291ul, + /* 30 */ 4294967291ul, + /* 31 */ 4294967291ul, +}; + struct dict_cmp { dict_t *dict; gf_boolean_t (*value_ignore) (char *k); @@ -47,7 +86,7 @@ get_new_data () } dict_t * -get_new_dict_full (int size_hint) +get_new_dict_full (uint32_t size_hint) { dict_t *dict = mem_get0 (THIS->ctx->dict_pool); @@ -67,17 +106,8 @@ get_new_dict_full (int size_hint) dict->members = &dict->members_internal; } else { - /* - * We actually need to allocate space for size_hint *pointers* - * but we actually allocate space for one *structure*. Since - * a data_pair_t consists of five pointers, we're wasting four - * pointers' worth for N=1, and will overrun what we allocated - * for N>5. If anybody ever starts using size_hint, we'll need - * to fix this. 
- */ - GF_ASSERT (size_hint <= - (sizeof(data_pair_t) / sizeof(data_pair_t *))); - dict->members = mem_get0 (THIS->ctx->dict_pair_pool); + dict->members = GF_CALLOC (size_hint, sizeof (data_pair_t *), + gf_common_mt_data_pair_t); if (!dict->members) { mem_put (dict); return NULL; @@ -108,6 +138,35 @@ dict_new (void) return dict; } +dict_t * +dict_new_by_size (uint32_t num) +{ + int32_t highest_bit = 0; + uint32_t bucket_size = 0; + dict_t *dict = NULL; + + if (num == 0) + goto out; + +#ifdef _GNU_SOURCE + highest_bit = 32 - __builtin_clz (num); +#else + while (num != 0) { + highest_bit++; + num >>= 1; + } +#endif + + bucket_size = bucket_sizes[highest_bit - 1]; + dict = get_new_dict_full (bucket_size); + + if (dict) + dict_ref (dict); + +out: + return dict; +} + int32_t is_data_equal (data_t *one, data_t *two) @@ -268,7 +327,7 @@ err_out: static data_pair_t * dict_lookup_common (dict_t *this, char *key) { - int hashval = 0; + uint32_t hashval = 0; if (!this || !key) { gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG, @@ -279,7 +338,7 @@ dict_lookup_common (dict_t *this, char *key) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. */ - if (this->hash_size != 1) + if (this->hash_size > 1) hashval = SuperFastHash (key, strlen (key)) % this->hash_size; data_pair_t *pair; @@ -319,7 +378,7 @@ dict_lookup (dict_t *this, char *key, data_t **data) static int32_t dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace) { - int hashval = 0; + uint32_t hashval = 0; data_pair_t *pair; char key_free = 0; int tmp = 0; @@ -336,7 +395,7 @@ dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. 
*/ - if (this->hash_size != 1) { + if (this->hash_size > 1) { tmp = SuperFastHash (key, strlen (key)); hashval = (tmp % this->hash_size); } @@ -478,7 +537,7 @@ dict_get (dict_t *this, char *key) void dict_del (dict_t *this, char *key) { - int hashval = 0; + uint32_t hashval = 0; if (!this || !key) { gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL, @@ -491,7 +550,7 @@ dict_del (dict_t *this, char *key) /* If the divisor is 1, the modulo is always 0, * in such case avoid hash calculation. */ - if (this->hash_size != 1) + if (this->hash_size > 1) hashval = SuperFastHash (key, strlen (key)) % this->hash_size; data_pair_t *pair = this->members[hashval]; diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index c5b82677e2e..5259c6befa1 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -79,9 +79,9 @@ struct _data_pair { struct _dict { unsigned char is_static:1; - int32_t hash_size; - int32_t count; - int32_t refcount; + uint32_t hash_size; + uint32_t count; + uint32_t refcount; data_pair_t **members; data_pair_t *members_list; char *extra_free; @@ -156,9 +156,11 @@ void *data_to_ptr (data_t *data); data_t *get_new_data (); data_t * data_copy (data_t *old); -dict_t *get_new_dict_full (int size_hint); +dict_t *get_new_dict_full (uint32_t size_hint); dict_t *get_new_dict (); +#define dict_for_each(d, c) for (c = d->members_list; c; c = c->next) + int dict_foreach (dict_t *this, int (*fn)(dict_t *this, char *key, @@ -196,6 +198,7 @@ int dict_keys_join (void *value, int size, dict_t *dict, /* CLEANED UP FUNCTIONS DECLARATIONS */ GF_MUST_CHECK dict_t *dict_new (void); +GF_MUST_CHECK dict_t *dict_new_by_size (uint32_t num); dict_t *dict_copy_with_ref (dict_t *this, dict_t *new); GF_MUST_CHECK int dict_reset (dict_t *dict); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 6e2d370605b..59f3df19420 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -78,6 +78,7 @@ #define 
ZR_STRICT_VOLFILE_CHECK "strict-volfile-check" #define ZR_DUMP_FUSE "dump-fuse" #define ZR_FUSE_MOUNTOPTS "fuse-mountopts" +#define IO_THREADS_QUEUE_SIZE_KEY "io-thread-queue-size" #define GF_XATTR_CLRLK_CMD "glusterfs.clrlk" #define GF_XATTR_PATHINFO_KEY "trusted.glusterfs.pathinfo" @@ -283,6 +284,51 @@ #define GF_LK_ADVISORY 0 #define GF_LK_MANDATORY 1 +#define GF_CHECK_XATTR_KEY_AND_GOTO(key, cmpkey, errval, lbl) \ + do { \ + if (key && strcmp (key, cmpkey) == 0) { \ + errval = -EINVAL; \ + goto lbl; \ + } \ + } while (0); \ + + +typedef enum { + GF_FOP_PRI_UNSPEC = -1, /* Priority not specified */ + GF_FOP_PRI_HI = 0, /* low latency */ + GF_FOP_PRI_NORMAL, /* normal */ + GF_FOP_PRI_LO, /* bulk */ + GF_FOP_PRI_LEAST, /* least */ + GF_FOP_PRI_MAX, +} gf_fop_pri_t; + +/* For backwards compatibility in io-threads */ +typedef gf_fop_pri_t iot_pri_t; +#define IOT_PRI_UNSPEC GF_FOP_PRI_UNSPEC +#define IOT_PRI_HI GF_FOP_PRI_HI +#define IOT_PRI_NORMAL GF_FOP_PRI_NORMAL +#define IOT_PRI_LO GF_FOP_PRI_LO +#define IOT_PRI_LEAST GF_FOP_PRI_LEAST +#define IOT_PRI_MAX GF_FOP_PRI_MAX + +static const char* FOP_PRI_STRINGS[] = { + "HIGH", + "NORMAL", + "LOW", + "LEAST" +}; + +static inline const char *fop_pri_to_string (gf_fop_pri_t pri) +{ + if (pri < 0) + return "UNSPEC"; + + if (pri >= GF_FOP_PRI_MAX) + return "INVALID"; + + return FOP_PRI_STRINGS[pri]; +} + const char *fop_enum_to_pri_string (glusterfs_fop_t fop); const char *fop_enum_to_string (glusterfs_fop_t fop); @@ -330,6 +376,7 @@ struct _cmd_args { uint32_t log_buf_size; uint32_t log_flush_timeout; int32_t max_connect_attempts; + unsigned int connect_attempts; char *print_exports; char *print_netgroups; /* advanced options */ diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c index 17cd68fc206..fa3ac840c43 100644 --- a/libglusterfs/src/iobuf.c +++ b/libglusterfs/src/iobuf.c @@ -30,8 +30,8 @@ struct iobuf_init_config gf_iobuf_init_config[] = { {8 * 1024, 128}, {32 * 1024, 64}, {128 * 1024, 32}, - {256 * 
1024, 8}, - {1 * 1024 * 1024, 2}, + {256 * 1024, 64}, + {1 * 1024 * 1024, 64}, }; int diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c index 611615949fa..d51e64768aa 100644 --- a/libglusterfs/src/latency.c +++ b/libglusterfs/src/latency.c @@ -21,6 +21,7 @@ #include "statedump.h" #include "libglusterfs-messages.h" +static int gf_set_fop_from_fn_pointer_warning; void gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn) { @@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void fop = GF_FOP_READDIRP; else if (fops->getspec == *(fop_getspec_t *)&fn) fop = GF_FOP_GETSPEC; - else - fop = -1; + else if (fops->ipc == *(fop_ipc_t *)&fn) + fop = GF_FOP_IPC; + else { + fop = GF_FOP_NULL; + GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning, + "latency", + GF_LOG_WARNING, + "Unknown FOP type"); + } frame->op = fop; } @@ -129,6 +137,13 @@ gf_update_latency (call_frame_t *frame) elapsed = (end->tv_sec - begin->tv_sec) * 1e6 + (end->tv_usec - begin->tv_usec); + if (frame->op < 0 || frame->op >= GF_FOP_MAXVALUE) { + gf_log ("[core]", GF_LOG_WARNING, + "Invalid frame op value: %d", + frame->op); + return; + } + lat = &frame->this->latencies[frame->op]; lat->total += elapsed; diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c index 88fbdf58319..d189be7960e 100644 --- a/libglusterfs/src/mem-pool.c +++ b/libglusterfs/src/mem-pool.c @@ -454,6 +454,10 @@ mem_get0 (struct mem_pool *mem_pool) void * mem_get (struct mem_pool *mem_pool) { +#ifdef DISABLE_MEMPOOL + return GF_CALLOC (1, mem_pool->real_sizeof_type, + gf_common_mt_mem_pool); +#else struct list_head *list = NULL; void *ptr = NULL; int *in_use = NULL; @@ -525,9 +529,11 @@ fwd_addr_out: UNLOCK (&mem_pool->lock); return ptr; +#endif /* DISABLE_MEMPOOL */ } +#ifndef DISABLE_MEMPOOL static int __is_member (struct mem_pool *pool, void *ptr) { @@ -546,11 +552,16 @@ __is_member (struct mem_pool *pool, void *ptr) return 
1; } +#endif void mem_put (void *ptr) { +#ifdef DISABLE_MEMPOOL + GF_FREE (ptr); + return; +#else struct list_head *list = NULL; int *in_use = NULL; void *head = NULL; @@ -628,6 +639,7 @@ mem_put (void *ptr) } } UNLOCK (&pool->lock); +#endif /* DISABLE_MEMPOOL */ } void diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index afa52d8bc45..fc7bf9e5996 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -168,6 +168,7 @@ enum gf_common_mem_types_ { /*lock migration*/ gf_common_mt_lock_mig, gf_common_mt_pthread_t, + gf_common_ping_local_t, gf_common_mt_end }; #endif diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c index f7b2bea2f30..903303d1380 100644 --- a/libglusterfs/src/timespec.c +++ b/libglusterfs/src/timespec.c @@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta) ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000); ts->tv_sec += delta.tv_sec; } + +void timespec_sub (const struct timespec *begin, const struct timespec *end, + struct timespec *res) +{ + if (end->tv_nsec < begin->tv_nsec) { + res->tv_sec = end->tv_sec - begin->tv_sec - 1; + res->tv_nsec = end->tv_nsec + 1000000000 - begin->tv_nsec; + } else { + res->tv_sec = end->tv_sec - begin->tv_sec; + res->tv_nsec = end->tv_nsec - begin->tv_nsec; + } +} diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h index f37194b97cf..9c393ee7166 100644 --- a/libglusterfs/src/timespec.h +++ b/libglusterfs/src/timespec.h @@ -20,5 +20,8 @@ void timespec_now (struct timespec *ts); void timespec_adjust_delta (struct timespec *ts, struct timespec delta); +void timespec_sub (const struct timespec *begin, + const struct timespec *end, + struct timespec *res); #endif /* __INCLUDE_TIMESPEC_H__ */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 3c1cde50fa0..b2529d3c4f7 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -117,6 +117,14 @@ out: } +static 
const char *xlator_lib_path (void) +{ + const char *libdir_env = getenv ("GLUSTER_LIBDIR"); + + return libdir_env ? libdir_env : XLATORDIR; +} + + int xlator_volopt_dynload (char *xlator_type, void **dl_handle, volume_opt_list_t *opt_list) @@ -130,9 +138,11 @@ xlator_volopt_dynload (char *xlator_type, void **dl_handle, /* socket.so doesn't fall under the default xlator directory, hence we * need this check */ if (!strstr(xlator_type, "rpc-transport")) - ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xlator_type); + ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), + xlator_type); else - ret = gf_asprintf (&name, "%s/%s.so", XLATORPARENTDIR, xlator_type); + ret = gf_asprintf (&name, "%s/../%s.so", xlator_lib_path (), + xlator_type); if (-1 == ret) { goto out; } @@ -183,7 +193,7 @@ xlator_dynload (xlator_t *xl) INIT_LIST_HEAD (&xl->volume_options); - ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xl->type); + ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), xl->type); if (-1 == ret) { goto out; } diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index 70e6f0a108d..2e04893c487 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -927,6 +927,7 @@ struct _xlator { gf_loglevel_t loglevel; /* Log level for translator */ + fop_latency_t client_latency; /* for latency measurement */ fop_latency_t latencies[GF_FOP_MAXVALUE]; @@ -17,7 +17,7 @@ done shift $((OPTIND-1)) -branch="release-3.8"; +branch="release-3.8-fb"; set_hooks_commit_msg() { diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index a7ff866ac99..7ce066dec5f 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -18,6 +18,7 @@ #include "mem-pool.h" #include "xdr-rpc.h" #include "rpc-common-xdr.h" +#include "timespec.h" char *clnt_ping_procs[GF_DUMP_MAXVALUE] = { @@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = { .procnames = clnt_ping_procs, }; +struct ping_local { + struct rpc_clnt 
*rpc; + struct timespec submit_time; +}; + /* Must be called under conn->lock */ static int __rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk) @@ -166,16 +172,48 @@ out: return; } +void +_update_client_latency (const rpc_clnt_connection_t *conn, + call_frame_t *frame, + uint64_t elapsed_usec) +{ + fop_latency_t *lat; + + lat = &frame->this->client_latency; + + if (elapsed_usec < lat->min) { + lat->min = elapsed_usec; + } + + if (elapsed_usec > lat->max) { + lat->max = elapsed_usec; + } + + lat->total += elapsed_usec; + lat->count++; + lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count; + gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, " + "avg: %0.6lf ms, count:%ld", + conn->trans->peerinfo.identifier, elapsed_usec / 1000.0, + lat->mean / 1000.0, lat->count); +} + int rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { - struct rpc_clnt *rpc = NULL; + struct ping_local *local = NULL; xlator_t *this = NULL; rpc_clnt_connection_t *conn = NULL; + call_frame_t *frame = NULL; struct timespec timeout = {0, }; + struct timespec now; + struct timespec delta; + int64_t latency_usec = 0; + int ret = 0; int unref = 0; + gf_boolean_t call_notify = _gf_false; if (!myframe) { gf_log (THIS->name, GF_LOG_WARNING, @@ -185,14 +223,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, frame = myframe; this = frame->this; - rpc = frame->local; - frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */ - conn = &rpc->conn; + local = frame->local; + conn = &local->rpc->conn; pthread_mutex_lock (&conn->lock); { if (req->rpc_status == -1) { - unref = rpc_clnt_remove_ping_timer_locked (rpc); + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (unref) { gf_log (this->name, GF_LOG_WARNING, "socket or ib related error"); @@ -207,8 +244,15 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, goto unlock; } - unref = rpc_clnt_remove_ping_timer_locked 
(rpc); - if (__rpc_clnt_rearm_ping_timer (rpc, + timespec_now (&now); + timespec_sub (&local->submit_time, &now, &delta); + latency_usec = delta.tv_sec * 1000000UL + + delta.tv_nsec / 1000UL; + + _update_client_latency (conn, frame, latency_usec); + call_notify = _gf_true; + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); + if (__rpc_clnt_rearm_ping_timer (local->rpc, rpc_clnt_start_ping) == -1) { gf_log (this->name, GF_LOG_WARNING, "failed to set the ping timer"); @@ -217,12 +261,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, } unlock: pthread_mutex_unlock (&conn->lock); + + if (call_notify) { + ret = local->rpc->notifyfn (local->rpc, this, + RPC_CLNT_PING, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "RPC_CLNT_PING notify failed"); + } + } out: if (unref) - rpc_clnt_unref (rpc); + rpc_clnt_unref (local->rpc); - if (frame) + if (frame) { + GF_FREE (frame->local); + frame->local = NULL; STACK_DESTROY (frame->root); + } return 0; } @@ -232,18 +288,27 @@ rpc_clnt_ping (struct rpc_clnt *rpc) call_frame_t *frame = NULL; int32_t ret = -1; rpc_clnt_connection_t *conn = NULL; + struct ping_local *local = NULL; conn = &rpc->conn; + local = GF_MALLOC (sizeof(struct ping_local), gf_common_ping_local_t); + if (!local) + return ret; frame = create_frame (THIS, THIS->ctx->pool); - if (!frame) + if (!frame) { + GF_FREE (local); return ret; + } - frame->local = rpc; + local->rpc = rpc; + timespec_now (&local->submit_time); + frame->local = local; ret = rpc_clnt_submit (rpc, &clnt_ping_prog, GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0, NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL); if (ret) { + /* FIXME: should we free the frame here? Methinks so! 
*/ gf_log (THIS->name, GF_LOG_ERROR, "failed to start ping timer"); } diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.h b/rpc/rpc-lib/src/rpc-clnt-ping.h index d92e5054190..7354679d50f 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.h +++ b/rpc/rpc-lib/src/rpc-clnt-ping.h @@ -9,7 +9,7 @@ */ -#define RPC_DEFAULT_PING_TIMEOUT 30 +#define RPC_DEFAULT_PING_TIMEOUT 300 void rpc_clnt_check_and_start_ping (struct rpc_clnt *rpc_ptr); int diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index d3df5560a8b..406efdb2d4f 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -21,6 +21,8 @@ #include "xdr-rpc.h" #include "rpc-common-xdr.h" +#pragma GCC diagnostic ignored "-Wformat=" + void rpc_clnt_reply_deinit (struct rpc_req *req, struct mem_pool *pool); @@ -122,6 +124,7 @@ call_bail (void *data) struct iovec iov = {0,}; char peerid[UNIX_PATH_MAX] = {0}; gf_boolean_t need_unref = _gf_false; + gf_boolean_t timedout_frames = _gf_false; GF_VALIDATE_OR_GOTO ("client", data, out); @@ -198,7 +201,6 @@ call_bail (void *data) "--", trav->rpcreq->procnum, trav->rpcreq->xid, frame_sent, conn->frame_timeout, peerid); - clnt = rpc_clnt_ref (clnt); trav->rpcreq->rpc_status = -1; trav->rpcreq->cbkfn (trav->rpcreq, &iov, 1, trav->frame); @@ -207,7 +209,30 @@ call_bail (void *data) clnt = rpc_clnt_unref (clnt); list_del_init (&trav->list); mem_put (trav); - } + timedout_frames = _gf_true; + } + /* So what on earth is this you ask? It was observed while testing + * the SHD threading code, that under high loads SHD/AFR related + * SyncOps & SyncTasks can actually hang/deadlock as the transport + * disconnected event never gets bubbled up correctly. Various + * tests indicated the ping timeouts worked fine, while "frame timeouts" + * did not. The only difference? Ping timeouts actually disconnect + * the transport while frame timeouts did not. 
So from a high-level we + * know this prevents deadlock as subsequent tests showed the deadlocks + * no longer ocurred (after this change). That said, there may be some + * more elegant solution. For now though, forcing a reconnect is + * preferential vs hanging clients or deadlocking the SHD. + * + * I suspect the culprit might be in + * afr-self-heal-common.c:afr_sh_common_lookup_cbk as this function + * will early-return if the callcount never actually reaches 0, + * which ordinarily is fine (you only want your callback called if + * the Nth response is received), but what happens if callcount + * never rearches 0? The callback won't be called. Theory at this + * point, but a good spot to start when we get a chance. + */ + if (timedout_frames) + rpc_transport_disconnect (clnt->conn.trans); out: rpc_clnt_unref (clnt); if (need_unref) diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index df19a0c403f..5ad4fd42298 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -19,6 +19,7 @@ typedef enum { RPC_CLNT_CONNECT, RPC_CLNT_DISCONNECT, + RPC_CLNT_PING, RPC_CLNT_MSG, RPC_CLNT_DESTROY } rpc_clnt_event_t; diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c index e224dcc022e..5556740ca81 100644 --- a/rpc/rpc-lib/src/rpc-transport.c +++ b/rpc/rpc-lib/src/rpc-transport.c @@ -166,6 +166,19 @@ out: +int rpc_transport_lib_path (char **name, char *type) +{ + int ret = -1; + char *libdir_env = getenv ("GLUSTER_LIBDIR"); + + ret = libdir_env == NULL + ? 
gf_asprintf (name, "%s/%s.so", RPC_TRANSPORTDIR, type) + : gf_asprintf (name, "%s/rpc-transport/%s.so", libdir_env, type); + return ret; +} + + + rpc_transport_t * rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name) { @@ -274,7 +287,7 @@ rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name) goto fail; } - ret = gf_asprintf (&name, "%s/%s.so", RPC_TRANSPORTDIR, type); + ret = rpc_transport_lib_path (&name, type); if (-1 == ret) { goto fail; } @@ -652,18 +665,37 @@ out: return ret; } +/** @brief build a dictionary containing basic transport options. + * + * @param[out] options: will be set to a newly created dictionary on success. + * @param[in] hostname: desired target hostname. + * @param[in] port: desired target port. + * @param[in] addr_family (optional): desired address family. If NULL, + * default will be used. + * + * @returns zero on success. + */ int rpc_transport_inet_options_build (dict_t **options, const char *hostname, - int port) + int port, const char *addr_family) { dict_t *dict = NULL; char *host = NULL; int ret = -1; +#ifdef IPV6_DEFAULT + const char *addr_family_default = "inet6"; +#else + const char *addr_family_default = "inet"; +#endif GF_ASSERT (options); GF_ASSERT (hostname); GF_ASSERT (port >= 1024); + if (!addr_family) { + addr_family = addr_family_default; + } + dict = dict_new (); if (!dict) goto out; @@ -688,6 +720,14 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, goto out; } + ret = dict_set_str (dict, "transport.address-family", + (char *)addr_family); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set address-family to %s", addr_family); + goto out; + } + ret = dict_set_str (dict, "transport-type", "socket"); if (ret) { gf_log (THIS->name, GF_LOG_WARNING, diff --git a/rpc/rpc-lib/src/rpc-transport.h b/rpc/rpc-lib/src/rpc-transport.h index f0add065065..0f555462ea4 100644 --- a/rpc/rpc-lib/src/rpc-transport.h +++ 
b/rpc/rpc-lib/src/rpc-transport.h @@ -311,5 +311,6 @@ rpc_transport_unix_options_build (dict_t **options, char *filepath, int frame_timeout); int -rpc_transport_inet_options_build (dict_t **options, const char *hostname, int port); +rpc_transport_inet_options_build (dict_t **options, const char *hostname, + int port, const char *addr_family); #endif /* __RPC_TRANSPORT_H__ */ diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c index 5a5c65114c4..bc661043674 100644 --- a/rpc/rpc-lib/src/rpcsvc.c +++ b/rpc/rpc-lib/src/rpcsvc.c @@ -37,9 +37,15 @@ #include <stdarg.h> #include <stdio.h> +#ifdef IPV6_DEFAULT +#include <netconfig.h> +#endif + #include "xdr-rpcclnt.h" #include "glusterfs-acl.h" +#pragma GCC diagnostic ignored "-Wformat=" + struct rpcsvc_program gluster_dump_prog; #define rpcsvc_alloc_request(svc, request) \ @@ -1392,6 +1398,90 @@ rpcsvc_error_reply (rpcsvc_request_t *req) return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL); } +#ifdef IPV6_DEFAULT +int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port, gf_boolean_t unregister_first) +{ + const int IP_BUF_LEN = 64; + char addr_buf[IP_BUF_LEN]; + + int err = 0; + bool_t success = 0; + struct netconfig *nc; + struct netbuf *nb; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + + err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff, + port & 0xff); + if (err < 0) { + err = -1; + goto out; + } + + nb = uaddr2taddr (nc, addr_buf); + if (!nb) { + err = -1; + goto out; + } + + if (unregister_first) { + /* Force the unregistration of the program first. + * This call may fail if nothing has been registered, + * which is fine. 
+ */ + rpcsvc_program_unregister_rpcbind6 (newprog); + } + + success = rpcb_set (newprog->prognum, newprog->progver, nc, nb); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6" + " service with rpcbind"); + } + + err = 0; + +out: + return err; +} + +int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog) +{ + int err = 0; + bool_t success = 0; + struct netconfig *nc; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + success = rpcb_unset (newprog->prognum, newprog->progver, nc); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6" + " service with rpcbind"); + } + + err = 0; +out: + return err; +} +#endif /* Register the program with the local portmapper service. */ int @@ -1556,7 +1646,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) " program failed"); goto out; } - +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_unregister_rpcbind6 (program); + if (ret == -1) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)" + " unregistration of program failed"); + goto out; + } +#endif pthread_mutex_lock (&svc->rpclock); { list_for_each_entry (prog, &svc->programs, program) { diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h index 08402373be6..17e72482531 100644 --- a/rpc/rpc-lib/src/rpcsvc.h +++ b/rpc/rpc-lib/src/rpcsvc.h @@ -437,6 +437,13 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener); extern int rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port); +#ifdef IPV6_DEFAULT +extern int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port, gf_boolean_t unregister_first); +extern int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog); +#endif + extern int rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog); diff --git a/rpc/rpc-lib/src/xdr-common.h b/rpc/rpc-lib/src/xdr-common.h index 596ac99640f..211e33272ba 100644 --- 
a/rpc/rpc-lib/src/xdr-common.h +++ b/rpc/rpc-lib/src/xdr-common.h @@ -105,4 +105,11 @@ unsigned long xdr_sizeof (xdrproc_t func, void *data); #define xdr_decoded_length(xdr) (((size_t)(&xdr)->x_private) - ((size_t)(&xdr)->x_base)) +/* + * The TIRPC headers rather annoyingly don't define this, even though it + * actually exists. + */ +extern u_long xdr_sizeof (xdrproc_t freebsd_compiler_is_broken, + void *so_is_net_bsd); + #endif diff --git a/rpc/rpc-transport/rdma/src/name.c b/rpc/rpc-transport/rdma/src/name.c index 8003b1c87a0..b9d3269eb73 100644 --- a/rpc/rpc-transport/rdma/src/name.c +++ b/rpc/rpc-transport/rdma/src/name.c @@ -54,6 +54,10 @@ af_inet_bind_to_port_lt_ceiling (struct rdma_cm_id *cm_id, struct sockaddr *sockaddr, socklen_t sockaddr_len, uint32_t ceiling) { +#if defined(NO_PRIVPORT) + _assign_port(sockaddr, 0); + return rdma_bind_addr (cm_id, sockaddr); +#else int32_t ret = -1; uint16_t port = ceiling - 1; gf_boolean_t ports[GF_PORT_MAX]; @@ -100,6 +104,7 @@ loop: } return ret; +#endif /* NO_PRIVPORT */ } #if 0 diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c index 0e34dc211fe..cab4161c076 100644 --- a/rpc/rpc-transport/socket/src/name.c +++ b/rpc/rpc-transport/socket/src/name.c @@ -42,6 +42,10 @@ static int32_t af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, socklen_t sockaddr_len, uint32_t ceiling) { +#if defined(NO_PRIVPORT) + _assign_port(sockaddr, 0); + return bind (fd, sockaddr, sockaddr_len); +#else int32_t ret = -1; uint16_t port = ceiling - 1; gf_boolean_t ports[GF_PORT_MAX]; @@ -88,6 +92,7 @@ loop: } return ret; +#endif /* NO_PRIVPORT */ } static int32_t @@ -557,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) data_t *address_family_data = NULL; int32_t ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; + sa_family_t default_family = AF_INET6; +#else + char *addr_family = "inet"; + sa_family_t default_family = AF_INET; +#endif + 
GF_VALIDATE_OR_GOTO ("socket", sa_family, out); address_family_data = dict_get (this->options, @@ -581,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) } } else { gf_log (this->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting to inet"); - *sa_family = AF_INET; + "option address-family not specified, " + "defaulting to %s", addr_family); + *sa_family = default_family; } ret = 0; diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c index ae551dcfae7..40a25bdba83 100644 --- a/rpc/rpc-transport/socket/src/socket.c +++ b/rpc/rpc-transport/socket/src/socket.c @@ -38,6 +38,7 @@ #include <errno.h> #include <rpc/xdr.h> #include <sys/ioctl.h> + #define GF_LOG_ERRNO(errno) ((errno == ENOTCONN) ? GF_LOG_DEBUG : GF_LOG_ERROR) #define SA(ptr) ((struct sockaddr *)ptr) @@ -55,7 +56,11 @@ /* TBD: do automake substitutions etc. (ick) to set these. */ #if !defined(DEFAULT_ETC_SSL) # ifdef GF_LINUX_HOST_OS +# ifdef GF_FBEXTRAS +# define DEFAULT_ETC_SSL "/var/lib/glusterd/ssl" +# else # define DEFAULT_ETC_SSL "/etc/ssl" +# endif # endif # ifdef GF_BSD_HOST_OS # define DEFAULT_ETC_SSL "/etc/openssl" @@ -866,7 +871,7 @@ __socket_keepalive (int fd, int family, int keepalive_intvl, goto err; } #else - if (family != AF_INET) + if (family != AF_INET && family != AF_INET6) goto done; ret = setsockopt (fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_idle, @@ -3009,6 +3014,21 @@ socket_connect (rpc_transport_t *this, int port) } } + /* Make sure we are not vulnerable to someone setting + * net.ipv6.bindv6only to 1 so that gluster services are + * avalable over IPv4 & IPv6. 
+ */ + int disable_v6only = 0; + + if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&disable_v6only, + sizeof (disable_v6only)) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Error disabling sockopt IPV6_V6ONLY: \"%s\"", + strerror (errno)); + } + + if (priv->nodelay && (sa_family != AF_UNIX)) { ret = __socket_nodelay (priv->sock); diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x index 8462dcc258a..5ec8109d828 100644 --- a/rpc/xdr/src/glusterfs-fops.x +++ b/rpc/xdr/src/glusterfs-fops.x @@ -84,6 +84,7 @@ enum glusterfs_event_t { GF_EVENT_UPCALL, GF_EVENT_SCRUB_STATUS, GF_EVENT_SOME_CHILD_DOWN, + GF_EVENT_CHILD_PING, GF_EVENT_MAXVAL }; diff --git a/run-tests.sh b/run-tests.sh index 1487f30d832..866ab0464b4 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -182,12 +182,14 @@ function get_test_status () # for later. Why does the key have the distro and version then? # Because changing the key in all test files would be very big process # updating just this function with a better logic much simpler. + # + # FB Edit: For FB tests we are disabling NetBSD testing. + # Linux) result=$(grep -e "^#G_TESTDEF_TEST_STATUS_CENTOS6" $test_name | \ awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;; NetBSD) - result=$(grep -e "^#G_TESTDEF_TEST_STATUS_NETBSD7" $test_name | \ - awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;; + result="KNOWN_ISSUE" ;; *) result="ENABLED" ;; esac diff --git a/site.h.in b/site.h.in new file mode 100644 index 00000000000..d917d78e59b --- /dev/null +++ b/site.h.in @@ -0,0 +1,27 @@ +/* + * Guidelines for using this file vs. configure.ac + * + * (1) If it already exists in configure.ac, leave it there. + * + * (2) If it needs to take effect at configure (not compile) time, it *needs* + * to go in configure.ac. + * + * (3) If it affects file paths, which are the things most likely to be based + * on an OS or distribution's generic filesystem hierarchy and not on a + * particular package's definition (e.g. 
an RPM specfile), it should probably + * go in configure.ac. + * + * (4) If it affects default sizes, limits, thresholds, or modes of operation + * (e.g. IPv4 vs. IPv6), it should probably go here. + * + * (5) For anything else, is it more like the things in 3 or the things in 4? + * Which approach is more convenient for the people who are likely to use the + * new option(s)? Make your best guesses, confirm with others, and go with + * what works. + */ + +/* + * This is just an example, and a way to check whether site.h is actually being + * included automatically. + */ +#define SITE_DOT_H_TEST 9987 diff --git a/test_env b/test_env new file mode 100644 index 00000000000..2e6c33c9e6a --- /dev/null +++ b/test_env @@ -0,0 +1,165 @@ +#!/bin/bash + +DESIRED_TESTS="\ + tests/basic/*.t\ + tests/basic/afr/*.t\ + tests/basic/distribute/*.t\ + tests/features/brick-min-free-space.t\ +" + +KNOWN_FLAKY_TESTS="\ + tests/bugs/glusterd/bug-1173414-mgmt-v3-remote-lock-failure.t\ + tests/bugs/glusterd/bug-1420637-volume-sync-fix.t\ + tests/bugs/glusterd/bug-1104642.t\ + tests/bugs/glusterd/bug-1022055.t\ + tests/bugs/glusterd/bug-1293414-import-brickinfo-uuid.t\ + tests/bugs/transport/bug-873367.t\ + tests/bugs/ec/bug-1161621.t\ + tests/bugs/quota/bug-1287996.t\ + tests/bugs/fb8149516.t\ + tests/bugs/posix/bug-990028.t\ + tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t\ + tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t\ + tests/bugs/write-behind/bug-1279730.t\ + tests/bugs/cli/bug-1320388.t\ + tests/bugs/snapshot/bug-1112613.t\ + tests/bugs/snapshot/bug-1087203.t\ + tests/bugs/snapshot/bug-1202436-calculate-quota-cksum-during-snap-restore.t\ + tests/bugs/snapshot/bug-1205592.t\ + tests/bugs/snapshot/bug-1140162-file-snapshot-features-encrypt-opts-validation.t\ + tests/bugs/glusterd/bug-1231437-rebalance-test-in-cluster.t\ + tests/bugs/snapshot/bug-1049834.t\ + tests/bugs/shard/zero-flag.t\ + tests/bugs/bitrot/1207029-bitrot-daemon-should-start-on-valid-node.t\ 
+ tests/bugs/bitrot/1209752-volume-status-should-show-bitrot-scrub-info.t\ + tests/bugs/snapshot/bug-1399598-uss-with-ssl.t\ + tests/bugs/tier/bug-1279376-rename-demoted-file.t\ + tests/bugs/tier/bug-1286974.t\ + tests/bugs/tier/bug-1205545-CTR-and-trash-integration.t\ + tests/features/ipc.t\ + tests/features/ssl-authz.t\ + tests/bugs/glusterd/bug-948686.t\ + tests/bugs/core/bug-986429.t\ + tests/bugs/fb4482137.t\ + tests/bugs/glusterd/bug-913555.t\ + tests/basic/rpm.t\ + tests/basic/accept-v6v4.t\ + tests/basic/afr/granular-esh/granular-esh.t\ + tests/basic/afr/granular-esh/cli.t\ + tests/basic/afr/granular-esh/granular-indices-but-non-granular-heal.t\ + tests/basic/afr/granular-esh/conservative-merge.t\ + tests/basic/afr/granular-esh/add-brick.t\ + tests/basic/afr/granular-esh/replace-brick.t\ + tests/basic/bd.t tests/basic/uss.t\ + tests/basic/glusterd/arbiter-volume-probe.t\ + tests/basic/meta.t\ + tests/basic/gfapi/bug1291259.t\ + tests/basic/gfapi/gfapi-ssl-test.t\ + tests/basic/tier/bug-1214222-directories_missing_after_attach_tier.t\ + tests/basic/tier/record-metadata-heat.t\ + tests/basic/tier/readdir-during-migration.t\ + tests/basic/fops-sanity-gfproxy.t\ + tests/basic/tier/ctr-rename-overwrite.t\ + tests/basic/tier/frequency-counters.t\ + tests/basic/mgmt_v3-locks.t\ + tests/basic/tier/file_with_spaces.t\ + tests/basic/tier/tier_lookup_heal.t\ + tests/basic/glusterd/volfile_server_switch.t\ + tests/basic/tier/tier-file-create.t\ + tests/basic/tier/locked_file_migration.t\ + tests/basic/tier/tier-snapshot.t\ + tests/basic/volume-snapshot.t\ + tests/basic/tier/new-tier-cmds.t\ + tests/basic/quota-nfs.t\ + tests/geo-rep/georep-basic-dr-rsync.t\ + tests/basic/tier/unlink-during-migration.t\ + tests/basic/tier/fops-during-migration-pause.t\ + tests/basic/volume-snapshot-clone.t\ + tests/bugs/nfs/bug-1166862.t\ + tests/basic/tier/legacy-many.t\ + tests/bugs/nfs/bug-1116503.t\ + tests/bugs/nfs/bug-904065.t\ + tests/bugs/rpc/bug-921072.t\ + 
tests/bugs/rpc/bug-847624.t\ + tests/bugs/glusterfs-server/bug-904300.t\ + tests/bugs/replicate/886998/strict-readdir.t\ + tests/basic/tier/fops-during-migration.t\ + tests/basic/tier/tierd_check.t\ + tests/basic/tier/tier.t\ + tests/bugs/replicate/bug-1250170-fsync.t\ + tests/basic/cache.t\ + tests/geo-rep/georep-basic-dr-tarssh.t\ + tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t\ + tests/bitrot/bug-1294786.t\ + tests/bugs/quick-read/bug-846240.t\ + tests/bugs/quota/afr-quota-xattr-mdata-heal.t\ + tests/bugs/quota/bug-1288474.t\ + tests/bugs/glusterd/bug-1344407-volume-delete-on-node-down.t\ + tests/bugs/glusterd/859927/repl.t\ + tests/bugs/glusterd/bug-1238706-daemons-stop-on-peer-cleanup.t\ + tests/bugs/replicate/bug-1290965-detect-bitrotten-objects.t\ + tests/bugs/glusterd/bug-1303028-Rebalance-glusterd-rpc-connection-issue.t\ + tests/bugs/replicate/bug-859581.t\ + tests/bugs/glusterd/bug-1047955.t\ + tests/bugs/glusterd/bug-1213295-snapd-svc-uninitialized.t\ + tests/bugs/glusterd/bug-1260185-donot-allow-detach-commit-unnecessarily.t\ + tests/bugs/glusterd/bug-1230121-replica_subvol_count_correct_cal.t\ + tests/bugs/glusterd/bug-1245045-remove-brick-validation.t\ + tests/bugs/glusterd/bug-948729/bug-948729.t\ + tests/bugs/glusterd/bug-948729/bug-948729-force.t\ + tests/bugs/glusterd/bug-948729/bug-948729-mode-script.t\ + tests/bugs/glusterd/bug-964059.t\ + tests/bugs/glusterd/bug-888752.t\ + tests/bugs/glusterd/bug-1177132-quorum-validation.t\ + tests/bugs/glusterd/bug-889630.t\ + tests/bugs/glusterd/bug-857330/xml.t\ + tests/bugs/glusterd/bug-857330/normal.t + tests/bugs/glusterd/bug-1367478-volume-start-validation-after-glusterd-restart.t\ + tests/bugs/glusterd/bug-1223213-peerid-fix.t\ + tests/bugs/glusterd/bug-1245142-rebalance_test.t\ + tests/bugs/glusterd/bug-1091935-brick-order-check-from-cli-to-glusterd.t\ + tests/bugs/glusterd/bug-1323287-real_path-handshake-test.t\ + tests/bugs/glusterd/bug-1266818-shared-storage-disable.t\ + 
tests/bugs/replicate/bug-802417.t\ + tests/bugs/glusterd/bug-1173414-mgmt-v3-remote-lock-failure.t\ + tests/bugs/glusterd/bug-1420637-volume-sync-fix.t\ + tests/bugs/glusterd/bug-1104642.t\ + tests/bugs/glusterd/bug-1293414-import-brickinfo-uuid.t\ + tests/bugs/glusterd/bug-1022055.t\ + tests/bugs/transport/bug-873367.t\ + tests/bugs/quota/bug-1287996.t\ + tests/bugs/fb8149516.t\ + tests/bugs/posix/bug-990028.t\ + tests/bugs/glusterd/bug-1345727-bricks-stop-on-no-quorum-validation.t\ + tests/bugs/core/bug-1402841.t-mt-dir-scan-race.t\ + tests/bugs/write-behind/bug-1279730.t\ + tests/bugs/cli/bug-1320388.t\ + tests/bugs/ec/bug-1304988.t\ + tests/bugs/glusterd/bug-948686.t\ + tests/bugs/snapshot/bug-1112613.t\ + tests/bugs/snapshot/bug-1087203.t\ + tests/bugs/glusterd/bug-913555.t\ + tests/bugs/snapshot/bug-1202436-calculate-quota-cksum-during-snap-restore.t\ + tests/bugs/snapshot/bug-1140162-file-snapshot-features-encrypt-opts-validation.t\ + tests/bugs/snapshot/bug-1205592.t tests/bugs/glusterd/bug-1231437-rebalance-test-in-cluster.t\ + tests/bugs/snapshot/bug-1227646.t tests/bugs/shard/zero-flag.t\ + tests/bugs/snapshot/bug-1399598-uss-with-ssl.t\ + tests/bugs/snapshot/bug-1049834.t\ + tests/bugs/core/bug-986429.t\ + tests/bugs/bitrot/1207029-bitrot-daemon-should-start-on-valid-node.t\ + tests/bugs/bitrot/1209752-volume-status-should-show-bitrot-scrub-info.t\ + tests/bugs/fb4482137.t\ + tests/bugs/tier/bug-1205545-CTR-and-trash-integration.t\ + tests/features/ipc.t\ + tests/bugs/tier/bug-1286974.t\ + tests/bugs/tier/bug-1279376-rename-demoted-file.t\ + tests/features/ssl-authz.t\ + tests/bugs/glusterd/bug-857330/normal.t\ + tests/bugs/distribute/bug-862967.t\ + tests/basic/quota-anon-fd-nfs.t\ + tests/basic/rpc-coverage.t\ + tests/basic/afr/gfid-mismatch.t\ +" + +DESIRED_TESTS=$(echo $DESIRED_TESTS | tr -s ' ' ' ') +KNOWN_FLAKY_TESTS=$(echo $KNOWN_FLAKY_TESTS | tr -s ' ' ' ') diff --git a/tests/basic/accept-v6v4.t b/tests/basic/accept-v6v4.t new file mode 100755 
index 00000000000..ce3a1bae7f9 --- /dev/null +++ b/tests/basic/accept-v6v4.t @@ -0,0 +1,148 @@ +#!/bin/bash + +. $(dirname $0)/../nfs.rc + +# +# This test ensures that GlusterFS provides NFS, Mount and its Management daemon +# over both IPv4 and IPv6. It uses netcat to check the services running on both +# IPv4 & IPv6 addresses as well as a mount to test that mount & nfs work. +# + +IPV4_SUPPORT=false +IPV6_SUPPORT=false + +host $HOSTNAME | grep -q "has address" && IPV4_SUPPORT=true +host $HOSTNAME | grep -q "has IPv6 address" && IPV6_SUPPORT=true + +. $(dirname $0)/../include.rc + +cleanup; + +mkdir -p $B0/b{0,1,2} + +# make sure no registered rpcbind services are running +service rpcbind restart + +TEST glusterd +TEST pidof glusterd + +TEST $CLI vol create $V0 replica 3 $H0:$B0/b0 $H0:$B0/b1 $H0:$B0/b2 + +TEST $CLI vol set $V0 cluster.self-heal-daemon off +TEST $CLI vol set $V0 nfs.disable off +TEST $CLI vol set $V0 cluster.choose-local off +TEST $CLI vol start $V0 + +MOUNTD_PORT=38465 +MGMTD_PORT=24007 +NFSD_PORT=2049 + +function check_ip_port { + ip=$1 + port=$2 + type=$3 + + nc_flags="" + if [ "$type" == "v6" ] && [ "$ip" == "NONE" ]; then + echo "Y" + return + else + nc_flags="-6" + fi + + if [ "$type" == "v4" ] && [ "$ip" == "NONE" ]; then + echo "Y" + return + fi + + if exec 3<>/dev/tcp/$ip/$port; then + echo "Y" + else + echo "N" + fi +} + +function check_nfs { + ip=$1 + type=$2 + + if [ "$ip" == "NONE" ]; then + echo "Y" + return + fi + + if [ "$type" == "v6" ]; then + addr="[$ip]" + else + addr="$ip" + fi + + if mount_nfs $addr:/$V0 $N0; then + umount_nfs $N0 + echo "Y" + else + echo "N" + fi +} + +if [ ! $IPV4_SUPPORT ] && [ ! 
$IPV6_SUPPORT ]; then + exit 1 +fi + +# Get the V4 & V6 addresses of this host +if $IPV4_SUPPORT; then + V4=$(host $HOSTNAME | head -n1 | awk -F ' ' '{print $4}') +else + V4="NONE" +fi + +if $IPV6_SUPPORT; then + V6=$(host $HOSTNAME | tail -n1 | awk -F ' ' '{print $5}') +else + V6="NONE" +fi + +# First check the management daemon +EXPECT "Y" check_ip_port $V6 $MGMTD_PORT "v6" +EXPECT "Y" check_ip_port $V4 $MGMTD_PORT "v4" + +# Give the MOUNT/NFS Daemon some time to start up +sleep 4 + +EXPECT "Y" check_ip_port $V4 $MOUNTD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $MOUNTD_PORT "v4" + +EXPECT "Y" check_ip_port $V4 $NFSD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $NFSD_PORT "v4" + +# Mount the file system +EXPECT "Y" check_nfs $V6 "v6" +EXPECT "Y" check_nfs $V4 "v4" + +# Test a rpcbind crash +pkill -9 rpcbind && service rpcbind start +sleep 15 + +# Test that the port re-registered +rpcinfo=$(rpcinfo -s | grep nfs | grep -v nfs_acl) + +function check_rpcinfo { + support=$1 + type=$2 + + if [ ! $support ]; then + echo "Y" + return + fi + + if [ "$type" == "v6" ]; then + echo $(echo $rpcinfo | grep tcp6 && echo "Y" || echo "N") + else + echo $(echo $rpcinfo | grep tcp && echo "Y" || echo "N") + fi +} + +EXPECT "Y" check_rpcinfo $IPV4_SUPPORT "v4" +EXPECT "Y" check_rpcinfo $IPV6_SUPPORT "v6" + +cleanup; diff --git a/tests/basic/afr/gfid-unsplit-shd.t b/tests/basic/afr/gfid-unsplit-shd.t new file mode 100644 index 00000000000..77da5243724 --- /dev/null +++ b/tests/basic/afr/gfid-unsplit-shd.t @@ -0,0 +1,98 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 nfs.disable off +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +#EST $CLI volume set $V0 cluster.favorite-child-by-majority off +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +#EST $CLI volume set $V0 cluster.favorite-child-by-size off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +mkdir foo +dd if=/dev/urandom of=foo/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum foo/splitfile | cut -d\ -f1) + +sleep 1 +cd ~ + +GFID_PARENT_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_PARENT_FORMATTED=$(echo "$GFID_PARENT_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo/splitfile 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')" + +# Create a split-brain by downing a brick, and flipping the +# gfid on 
the down brick, then bring the brick back up. + +# For good measure kill the first brick so the inode cache is wiped, we don't +# want any funny business +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST $CLI volume start $V0 force +pkill -f gluster/glustershd + +rm -f $GFID_LINK_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/foo/splitfile +sleep 1 +TEST touch $B0/${V0}1/foo/splitfile + +mkdir -p $B0/${V0}1/.glusterfs/fd/55 +ln $B0/${V0}1/foo/splitfile $B0/${V0}1/.glusterfs/fd/55/fd551a5c-fddd-4c1a-a4d0-96ef09ef5c08 +cd ~ + +touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_FORMATTED +touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED + +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +EXPECT_WITHIN 60 "0" get_pending_heal_count $V0 + +TEST stat $B0/${V0}1/foo/splitfile + +cd $M0 + +# Tickle the file to trigger the gfid unsplit +TEST stat foo/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=foo/splitfile of=/dev/null 2>/dev/null + +# Verify entry healing happened on the back-end regardless of the +# gfid-splitbrain state of the directory. +TEST stat $B0/${V0}1/foo/splitfile + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum foo/splitfile | cut -d\ -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f foo/splitfile +cd ~ + +cleanup diff --git a/tests/basic/afr/gfid-unsplit-type-mismatch.t b/tests/basic/afr/gfid-unsplit-type-mismatch.t new file mode 100644 index 00000000000..9e205021a0d --- /dev/null +++ b/tests/basic/afr/gfid-unsplit-type-mismatch.t @@ -0,0 +1,86 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 nfs.disable on +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +pkill -f gluster/glustershd + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +dd if=/dev/urandom of=splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum splitfile | cut -d\ -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. +TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +rm -fv $B0/${V0}1/splitfile + +# Now really screw the file up, by changing it's type to a directory +# not a file...the so-called "type mismatch" situation. Our test +# should prove we can un-mangle this situation using the same strategy. +mkdir $B0/${V0}1/splitfile +touch -t 199011011510 $B0/${V0}1/splitfile +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile +cd ~ + +touch $M0/newfile + +# Synthetically force a conservative merge of the directory. 
We want +# to ensure that conservative merges happen in-spite of GFID mis-matches, +# since we can handle them there's no sense in not doing these. In fact, +# if we stop them it will block GFID split-brain resolution. +setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1 +setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 +cd $M0 + +# Tickle the file to trigger the gfid unsplit +TEST stat splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=splitfile of=/dev/null 2>/dev/null +# Verify entry healing happened on the back-end regardless of the +# gfid-splitbrain state of the directory. +TEST stat $B0/${V0}1/splitfile + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum splitfile | cut -d\ -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f splitfile +cd ~ + +cleanup diff --git a/tests/basic/afr/gfid-unsplit.t b/tests/basic/afr/gfid-unsplit.t new file mode 100644 index 00000000000..0b883ab658f --- /dev/null +++ b/tests/basic/afr/gfid-unsplit.t @@ -0,0 +1,120 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. 
$(dirname $0)/../../nfs.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 nfs.disable off +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.favorite-child-policy mtime +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +dd if=/dev/urandom of=$M0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $M0/splitfile | cut -d\ -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. +TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +mkdir -p $B0/${V0}1/.glusterfs/fd/55 +ln $B0/${V0}1/splitfile $B0/${V0}1/.glusterfs/fd/55/fd551a5c-fddd-4c1a-a4d0-96ef09ef5c08 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +touch $M0/newfile + +# Synthetically force a conservative merge of the directory. 
We want +# to ensure that conservative merges happen in-spite of GFID mis-matches, +# since we can handle them there's no sense in not doing these. In fact, +# if we stop them it will block GFID split-brain resolution. +setfattr -n trusted.afr.patchy-client-1 -v 0x000000000000000000000002 $B0/${V0}1 +setfattr -n trusted.afr.patchy-client-2 -v 0x000000000000000000000002 $B0/${V0}1 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $M0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null + +# Verify entry healing happened on the back-end regardless of the +# gfid-splitbrain state of the directory. +TEST stat $B0/${V0}1/splitfile + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $M0/splitfile | cut -d\ -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $M0/splitfile + +# Part II: NFS test +TEST mount_nfs $H0:/$V0 $N0 nolock +#EST mount -t nfs -o nolock,noatime,noacl,soft,intr $H0:/$V0 $N0; + +dd if=/dev/urandom of=$N0/splitfile bs=128k count=5 2>/dev/null + +MD5=$(md5sum $N0/splitfile | cut -d\ -f1) + +# Create a split-brain by downing a brick, and flipping the +# gfid on the down brick, then bring the brick back up. 
+TEST kill_brick $V0 $H0 $B0/${V0}1 +GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +rm -rf $GFID_DIR_B1 +TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile + +GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')" +#EST rm -f $B0/${V0}3/splitfile +#m -rf $GFID_DIR_B3 + +# Restart the down brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 +sleep 5 + +# Tickle the file to trigger the gfid unsplit +TEST stat $N0/splitfile +sleep 1 + +# Verify the file is readable +TEST dd if=$N0/splitfile of=/dev/null 2>/dev/null + +# Verify the MD5 signature of the file +HEALED_MD5=$(md5sum $N0/splitfile | cut -d\ -f1) +TEST [ "$MD5" == "$HEALED_MD5" ] + +# Verify the file can be removed +TEST rm -f $N0/splitfile + +cleanup diff --git a/tests/basic/afr/metadata-self-heal.t b/tests/basic/afr/metadata-self-heal.t index b88c16a93e1..45bae7bdbfc 100644 --- a/tests/basic/afr/metadata-self-heal.t +++ b/tests/basic/afr/metadata-self-heal.t @@ -50,6 +50,7 @@ function print_pending_heals { TEST glusterd TEST pidof glusterd TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} +TEST $CLI volume set $V0 performance.stat-prefetch off TEST $CLI volume start $V0 TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 cd $M0 diff --git a/tests/basic/afr/self-heal.t b/tests/basic/afr/self-heal.t index e1ac17c2d79..f2af52d9773 100644 --- a/tests/basic/afr/self-heal.t +++ b/tests/basic/afr/self-heal.t @@ -194,13 +194,22 @@ TEST rm -rf $M0/* #7. Link/symlink heal +# Make links (especially symlinks) with relative paths instead of absolute +# paths, because absolute paths pointing from the brick to the mountpoint have +# caused problems. 
+make_link () { + mountpoint=$1; shift + # Do this in a subshell so we don't change "cd -" for the parent. + (cd $mountpoint; ln $*) +} + #Test TEST touch $M0/file -TEST ln $M0/file $M0/link_to_file +TEST make_link $M0 file link_to_file TEST kill_brick $V0 $H0 $B0/brick0 TEST rm -f $M0/link_to_file -TEST ln -s $M0/file $M0/link_to_file -TEST ln $M0/file $M0/hard_link_to_file +TEST make_link $M0 file -s link_to_file +TEST make_link $M0 file hard_link_to_file TEST $CLI volume start $V0 force EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status diff --git a/tests/basic/afr/shd-autofix-nogfid.t b/tests/basic/afr/shd-autofix-nogfid.t new file mode 100644 index 00000000000..7c9026dce62 --- /dev/null +++ b/tests/basic/afr/shd-autofix-nogfid.t @@ -0,0 +1,68 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 nfs.disable on +TEST $CLI volume set $V0 cluster.quorum-type auto +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +# Kill the SHD while we setup the test +pkill -f gluster/glustershd +TEST kill_brick $V0 $H0 $B0/${V0}1 + +mkdir $M0/foo +dd if=/dev/urandom of=$M0/foo/testfile bs=128k count=5 2>/dev/null 
+MD5=$(md5sum $M0/foo/testfile | cut -d\ -f1) + +mkdir $B0/${V0}1/foo + +# Kick off the SHD and wait 30 seconds for healing to take place +TEST gluster vol start $V0 force +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 + +# Verify the file was healed back to brick 1 +TEST stat $B0/${V0}1/foo/testfile + +# Part II: Test recovery for a file without a GFID +# Kill the SHD while we setup the test +pkill -f gluster/glustershd +TEST kill_brick $V0 $H0 $B0/${V0}1 +rm -f $GFID_LINK_B1 +rm -f $B0/${V0}1/foo/testfile +touch $B0/${V0}1/foo/testfile + +# Queue the directories for healing, don't bother the queue the file +# as this shouldn't be required. +touch $B0/${V0}3/.glusterfs/indices/xattrop/00000000-0000-0000-0000-000000000001 +touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED + +TEST gluster vol start $V0 force +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 +TEST stat $B0/${V0}1/foo/testfile + +# Prove the directory and file are removable +TEST rm -f $B0/${V0}1/foo/testfile +TEST rmdir $B0/${V0}1/foo + +cleanup diff --git a/tests/basic/afr/shd-force-inspect.t b/tests/basic/afr/shd-force-inspect.t new file mode 100644 index 00000000000..caceb841322 --- /dev/null +++ b/tests/basic/afr/shd-force-inspect.t @@ -0,0 +1,61 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 nfs.disable on +TEST $CLI volume set $V0 cluster.quorum-type none +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +# Part I: FUSE Test +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +mkdir foo +dd if=/dev/urandom of=foo/testfile bs=128k count=5 2>/dev/null +MD5=$(md5sum foo/testfile | cut -d\ -f1) + +# Kill the SHD while we setup the test +pkill -f gluster/glustershd + +# Grab the GFID of the file and parent dir +GFID_PARENT_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_PARENT_FORMATTED=$(echo "$GFID_PARENT_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/foo/testfile 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')" + +# Nuke the file from brick 1 +rm -f $GFID_LINK_B1 +rm -f $B0/${V0}1/foo/testfile + +# Now 
manually queue up the parent directory for healing +touch $B0/${V0}2/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED +touch $B0/${V0}3/.glusterfs/indices/xattrop/$GFID_PARENT_FORMATTED + +# Kick off the SHD and wait 30 seconds for healing to take place +TEST gluster vol start patchy force +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 + +# Verify the file was healed back to brick 1 +TEST stat $B0/${V0}1/foo/testfile + +cleanup diff --git a/tests/basic/afr/shd-pgfid-heal.t b/tests/basic/afr/shd-pgfid-heal.t new file mode 100644 index 00000000000..6213e4c6374 --- /dev/null +++ b/tests/basic/afr/shd-pgfid-heal.t @@ -0,0 +1,81 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 nfs.disable on +TEST $CLI volume set $V0 cluster.quorum-type none +#EST $CLI volume set $V0 cluster.favorite-child-by-majority on +#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on +TEST $CLI volume set $V0 cluster.pgfid-self-heal on +TEST $CLI volume set $V0 cluster.favorite-child-policy majority +TEST $CLI volume set $V0 storage.build-pgfid on +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +sleep 5 + +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +cd $M0 +mkdir -p a/b/c +dd if=/dev/urandom of=a/b/c/testfile bs=128k count=5 2>/dev/null + +# Kill the SHD while we setup the test +pkill -f gluster/glustershd +# Kill the brick as well such that +TEST kill_brick $V0 $H0 $B0/${V0}1 + +echo stuff >> $M0/a/b/c/testfile 
+MD5=$(md5sum a/b/c/testfile | cut -d\ -f1) + +# Grab the GFID of the file and parent dir +GFID_PARENT_B_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_PARENT_B_FORMATTED=$(echo "$GFID_PARENT_B_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_PARENT_B_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_PARENT_B_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')" +GFID_PARENT_C_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b/c 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_PARENT_C_FORMATTED=$(echo "$GFID_PARENT_C_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_PARENT_C_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_PARENT_C_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')" +GFID_RAW=$(getfattr -n trusted.gfid -e hex $B0/${V0}1/a/b/c/testfile 2>/dev/null | grep trusted.gfid | cut -d= -f2) +GFID_FORMATTED=$(echo "$GFID_RAW" | awk '{print substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}') +GFID_LINK_B1="$B0/${V0}1/.glusterfs/$(echo $GFID_RAW | awk '{print substr($0,3,2)"/"substr($0,5,2)"/"substr($1,3,8)"-"substr($1,11,4)"-"substr($1,15,4)"-"substr($1,19,4)"-"substr($1,23,12)}')" + +# +# Here we are going to create a situation such that a file 3 +# levels deep into the FS requires healing, along with 2 levels +# of parent directories. The only signal SHD has is that the +# file itself needs healing. The directory (entry) heals are +# missing; simulating a crash or some sort of bug that we need +# to be able to recover from. 
+# + +# Nuke the file from brick 1, along with the parent directories +# and all backend hard/symbolic links +rm -f $B0/${V0}1/a/b/c/testfile +rm -f $GFID_LINK_B1 +rmdir $B0/${V0}1/a/b/c +rm -f $GFID_PARENT_C_LINK_B1 +rmdir $B0/${V0}1/a/b +rm -f $GFID_PARENT_B_LINK_B1 + +# Kick off the SHD and wait 30 seconds for healing to take place +TEST gluster vol start patchy force +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 +sleep 5 + +# Verify the file was healed back to brick 1 +TEST stat $B0/${V0}1/a/b/c/testfile + +cleanup diff --git a/tests/basic/bd.t b/tests/basic/bd.t index 63622edd709..11582db81c0 100755 --- a/tests/basic/bd.t +++ b/tests/basic/bd.t @@ -86,6 +86,7 @@ TEST pidof glusterd configure TEST $CLI volume create $V0 ${H0}:/$B0/$V0?${V0} +TEST $CLI volume set $V0 performance.stat-prefetch off EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; diff --git a/tests/basic/cache.t b/tests/basic/cache.t new file mode 100644 index 00000000000..92251732f4a --- /dev/null +++ b/tests/basic/cache.t @@ -0,0 +1,69 @@ +#!/bin/bash +# + +FILE=/var/log/glusterfs/samples/glusterfs_patchy.samp +rm $FILE + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function print_cnt() { + local FOP_TYPE=$1 + local FOP_CNT=$(grep ,${FOP_TYPE} $FILE | wc -l) + echo $FOP_CNT +} + +function print_avg() { + local FOP_TYPE=$1 + local FILE=/var/log/glusterfs/samples/glusterfs_patchy.samp + local FOP_AVG=$(grep -oE "${FOP_TYPE},[0-9]+\." 
${FILE} | grep -oE '[0-9]+' | awk 'NR == 1 { sum = 0 } { sum += $1; } END {printf "%d", sum/NR}') + echo $FOP_AVG +} + +cleanup; +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 nfs.disable off +TEST $CLI volume set $V0 diagnostics.latency-measurement on +TEST $CLI volume set $V0 diagnostics.count-fop-hits on +TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535 +TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1 +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 1 +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +for i in {1..100} +do + df $M0 &> /dev/null +done + +sleep 6 + +# Get average +STATFS_CNT0=$(print_cnt STATFS) +TEST [ "$STATFS_CNT0" -gt "0" ] +STATFS_AVG0=$(print_avg STATFS) +# Make it easier to compute averages +rm $FILE + +TEST $CLI volume set $V0 performance.nfs.io-cache on +TEST $CLI volume set $V0 performance.statfs-cache on +TEST $CLI volume set $V0 performance.statfs-cache-timeout 10 + +for i in {1..100} +do + df $M0 &> /dev/null +done + +sleep 6 + +# Get average +STATFS_CNT1=$(print_cnt STATFS) +TEST [ "$STATFS_CNT1" -eq "$STATFS_CNT0" ] +STATFS_AVG1=$(print_avg STATFS) + +# Verify that cached average * 10 is still faster than uncached +STATFS_AVG1x10=$(($STATFS_AVG1 * 10)) +TEST [ "$STATFS_AVG0" -gt "$STATFS_AVG1x10" ] +#cleanup diff --git a/tests/basic/dht-min-free-space.t b/tests/basic/dht-min-free-space.t new file mode 100755 index 00000000000..9553f9247aa --- /dev/null +++ b/tests/basic/dht-min-free-space.t @@ -0,0 +1,69 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../traps.rc + +grep $B0/patchy1 /proc/mounts &> /dev/null && umount $B0/patchy1 +grep $B0/patchy2 /proc/mounts &> /dev/null && umount $B0/patchy2 +mkdir $B0/${V0}{1..2} + +TEST glusterd + +TEST truncate --size $((30*1048576)) $B0/${V0}-dev1 +push_trapfunc "rm -f $B0/${V0}-dev1" +TEST truncate --size $((30*1048576)) $B0/${V0}-dev2 +push_trapfunc "rm -f $B0/${V0}-dev2" + +TEST mkfs.xfs $B0/${V0}-dev1 +TEST mkfs.xfs $B0/${V0}-dev2 + +TEST mount -o loop $B0/${V0}-dev1 $B0/${V0}1 +TEST mount -o loop $B0/${V0}-dev2 $B0/${V0}2 + +TEST $CLI volume create $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}2 +TEST $CLI volume set $V0 cluster.min-free-disk 2MB +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 0 +TEST $CLI volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +#################################### +# Test re-directs of file creation # +#################################### + +# This should work, no redirects +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=8 +TEST [ -f /d/backends/${V0}2/testfile1 ] && [ ! -k /d/backends/${V0}1/testfile1 ] + +TEST $CLI volume set $V0 cluster.min-free-disk 19MB + +# This should work, & the file redirected +# Subvolume 2 should have the linkto & +# Subvolume 1 should have the original +TEST dd if=/dev/zero of=$M0/testfile3 bs=1M count=4 +TEST [ -f /d/backends/${V0}1/testfile3 ] && [ ! -k /d/backends/${V0}1/testfile3 ] +TEST [ -k /d/backends/${V0}2/testfile3 ] + +# This should fail, cluster is full +TEST ! dd if=/dev/zero of=$M0/testfile2 bs=1M count=23 + +################### +# Strict mode off # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode off +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=20 +TEST rm -f $M0/testfile1 + +################### +# Strict mode on # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST ! 
dd if=/dev/zero of=$M0/testfile1 bs=1M count=16 +TEST rm -f $M0/testfile1 + +# Cleanup will deal with our mounts for us, and (because we used "-o loop") our +# device files too, but not the underlying files. That will happen in the EXIT +# trap handler instead. +cleanup; diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common index 83c4463a912..152e3b51236 100644 --- a/tests/basic/ec/ec-common +++ b/tests/basic/ec/ec-common @@ -45,7 +45,7 @@ for size in $SIZE_LIST; do eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }') done -TEST df -h +TEST df -h $M0 TEST stat $M0 for idx in `seq 0 $LAST_BRICK`; do diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 98dd9232c73..3e3467535fb 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -136,7 +136,7 @@ TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024 cs=$(sha1sum $tmp/test | awk '{ print $1 }') -TEST df -h +TEST df -h $M0 TEST stat $M0 for idx in {0..5}; do diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t index fdaf9c2822e..da88bbcb2cc 100644 --- a/tests/basic/exports_parsing.t +++ b/tests/basic/exports_parsing.t @@ -32,7 +32,20 @@ function test_bad_opt () glusterfsd --print-exports $1 2>&1 | sed -n 1p } -EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports +function check_export_line() { + if [ "$1" == "$2" ]; then + echo "Y" + else + echo "N" + fi + return +} + +export_result=$(test_good_file $EXP_FILES/exports) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result" + +export_result=$(test_good_file $EXP_FILES/exports-v6) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result" EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports EXPECT_KEYWORD "Error parsing netgroups for:" 
test_long_netgroup $EXP_FILES/bad_exports diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t index cea8aa737c0..713c7e27579 100644 --- a/tests/basic/fop-sampling.t +++ b/tests/basic/fop-sampling.t @@ -2,13 +2,27 @@ # . $(dirname $0)/../include.rc +. $(dirname $0)/../nfs.rc . $(dirname $0)/../volume.rc -SAMPLE_FILE="$(gluster --print-logdir)/samples/glusterfs_${V0}.samp" +BRICK_SAMPLES="$(gluster --print-logdir)/samples/glusterfsd__d_backends_${V0}0.samp" +NFS_SAMPLES="$(gluster --print-logdir)/samples/glusterfs_nfsd.samp" + +function check_path { + op=$1 + path=$2 + file=$3 + grep $op $file | awk -F, '{print $11}' | grep $path 2>&1 > /dev/null + if [ $? -eq 0 ]; then + echo "Y" + else + echo "N" + fi +} function print_cnt() { local FOP_TYPE=$1 - local FOP_CNT=$(grep ,${FOP_TYPE} ${SAMPLE_FILE} | wc -l) + local FOP_CNT=$(grep ,${FOP_TYPE} ${BRICK_SAMPLES} | wc -l) echo $FOP_CNT } @@ -42,12 +56,18 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} TEST $CLI volume set $V0 nfs.disable off TEST $CLI volume set $V0 diagnostics.latency-measurement on TEST $CLI volume set $V0 diagnostics.count-fop-hits on -TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 5 TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535 TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1 TEST $CLI volume set $V0 diagnostics.stats-dnscache-ttl-sec 3600 - TEST $CLI volume start $V0 + +>${NFS_SAMPLES} +>${BRICK_SAMPLES} + +################# +# Basic Samples # +################# TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 for i in {1..5} @@ -58,4 +78,52 @@ done TEST ls -l $M0 EXPECT_WITHIN 6 "OK" check_samples -cleanup +sleep 2 + +################################ +# Paths in the samples # +################################ + +TEST mount_nfs $H0:$V0 $N0 + +ls $N0 &> /dev/null +touch $N0/file1 +stat $N0/file1 &> /dev/null +echo "some data" > 
$N0/file1 +dd if=/dev/zero of=$N0/file2 bs=1M count=10 conv=fsync +dd if=/dev/zero of=$N0/file1 bs=1M count=1 +cat $N0/file2 &> /dev/null +mkdir -p $N0/dir1 +rmdir $N0/dir1 +rm $N0/file1 +rm $N0/file2 + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FINODELK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ENTRYLK / $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $BRICK_SAMPLES + + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path READ /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $NFS_SAMPLES + +cleanup; diff --git a/tests/basic/fops-sanity-gfproxy.t b/tests/basic/fops-sanity-gfproxy.t new file mode 100755 index 00000000000..b3bb8a502cc --- /dev/null +++ b/tests/basic/fops-sanity-gfproxy.t @@ -0,0 +1,32 @@ +#!/bin/bash + +. 
$(dirname $0)/../include.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/brick1; +EXPECT 'Created' volinfo_field $V0 'Status'; + +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +#gfproxy server +TEST glusterfs --volfile-id=gfproxy/$V0 --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log + +#mount on a random dir +TEST glusterfs --entry-timeout=3600 --attribute-timeout=3600 -s $H0 --volfile-id=gfproxy-client/$V0 $M0 --direct-io-mode=yes +TEST grep gfproxy-client /proc/mounts + +build_tester $(dirname $0)/fops-sanity.c + +TEST cp $(dirname $0)/fops-sanity $M0 +cd $M0 +TEST ./fops-sanity $V0 +cd - +rm -f $(dirname $0)/fops-sanity + +cleanup; diff --git a/tests/basic/gfid-access.t b/tests/basic/gfid-access.t index 19b6564e676..fc29a19fc6c 100644 --- a/tests/basic/gfid-access.t +++ b/tests/basic/gfid-access.t @@ -8,6 +8,7 @@ cleanup; TEST glusterd TEST pidof glusterd TEST $CLI volume create $V0 $H0:$B0/${V0}0 +TEST $CLI volume set $V0 performance.stat-prefetch off TEST $CLI volume start $V0 TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 --aux-gfid-mount; TEST mkdir $M0/a diff --git a/tests/basic/gfproxy.t b/tests/basic/gfproxy.t new file mode 100644 index 00000000000..71c6788db76 --- /dev/null +++ b/tests/basic/gfproxy.t @@ -0,0 +1,74 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../nfs.rc + +cleanup; + +function start_gfproxyd { + glusterfs --volfile-id=gfproxy/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log +} + +function restart_gfproxyd { + pkill -f gfproxy/${V0} + start_gfproxyd +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 config.gfproxyd-remote-host $H0 +TEST $CLI volume start $V0 + +sleep 2 + +REGULAR_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-fuse.vol" +GFPROXY_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-gfproxy-fuse.vol" +GFPROXYD_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.gfproxyd.vol" + +# Client volfile must exist +TEST [ -f $GFPROXY_CLIENT_VOLFILE ] + +# AHA & write-behind translators must exist +TEST grep "cluster/aha" $GFPROXY_CLIENT_VOLFILE +TEST grep "performance/write-behind" $GFPROXY_CLIENT_VOLFILE + +# Make sure we didn't screw up the existing client +TEST grep "performance/write-behind" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/replicate" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/distribute" $REGULAR_CLIENT_VOLFILE + +TEST [ -f $GFPROXYD_VOLFILE ] + +TEST grep "cluster/replicate" $GFPROXYD_VOLFILE +TEST grep "cluster/distribute" $GFPROXYD_VOLFILE + +# AHA & write-behind must *not* exist +TEST ! grep "cluster/aha" $GFPROXYD_VOLFILE +TEST ! 
grep "performance/write-behind" $GFPROXYD_VOLFILE + +# Test that we can start the server and the client +TEST start_gfproxyd +TEST glusterfs --volfile-id=gfproxy-client/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy-client.log $M0 +sleep 2 +TEST grep gfproxy-client/${V0} /proc/mounts + +# Write data to the mount and checksum it +TEST dd if=/dev/urandom bs=1M count=10 of=/tmp/testfile1 +md5=$(md5sum /tmp/testfile1 | awk '{print $1}') +TEST cp -v /tmp/testfile1 $M0/testfile1 +TEST [ "$(md5sum $M0/testfile1 | awk '{print $1}')" == "$md5" ] + +rm /tmp/testfile1 + +dd if=/dev/zero of=$N0/bigfile bs=1M count=3072 & +BG_STRESS_PID=$! + +sleep 3 + +restart_gfproxyd + +TEST wait $BG_STRESS_PID + +cleanup; diff --git a/tests/basic/glusterd/volfile_server_switch.t b/tests/basic/glusterd/volfile_server_switch.t index 0b0e6470244..0b01398215c 100644 --- a/tests/basic/glusterd/volfile_server_switch.t +++ b/tests/basic/glusterd/volfile_server_switch.t @@ -1,5 +1,8 @@ #!/bin/bash +#G_TESTDEF_TEST_STATUS_CENTOS6=KNOWN_ISSUE,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=KNOWN_ISSUE,BUG=000000 + . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc . $(dirname $0)/../../cluster.rc diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t new file mode 100644 index 00000000000..f3655eaef3b --- /dev/null +++ b/tests/basic/halo-failover-disabled.t @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +# brick immediately, and md5s will show they are equal once +# the write completes. +# 4. The mount should also be RW after the brick is killed as +# quorum will be immediately restored by swapping in the +# other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.halo-failover-enabled off +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG + +# Use a large ping time here so the spare brick is not marked up +# based on the ping time. The only way it can get marked up is +# by being swapped in via the down event (which is what we are disabling). +TEST $CLI volume set $V0 network.ping-timeout 1000 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. 
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX} + +# Make sure two children are down and one is up. +EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3 + +# Test that quorum should fail and the mount is RO, the reason here +# is that although there _is_ another brick running which _could_ +# take the failed brick's place, it is not marked "up" so quorum +# will not be fulfilled. If we waited 1000 seconds the brick would +# indeed be activated based on ping time, but for our test we want +# the decision to be solely "down event" driven, not ping driven. +TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX + +# Test that quorum should be restored and the file is writable +TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 + +cleanup diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t new file mode 100644 index 00000000000..7d23d80968a --- /dev/null +++ b/tests/basic/halo-failover-enabled.t @@ -0,0 +1,85 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +# brick immediately, and md5s will show they are equal once +# the write completes. +# 4. The mount should also be RW after the brick is killed as +# quorum will be immediately restored by swapping in the +# other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. 
$(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-failover-enabled on +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 network.ping-timeout 20 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST [ -n "$KILL_IDX" ] +# NB: UP_CHILDREN is the set of children that should be up after we kill +# the brick indicated by KILL_IDX, *not* the set of children which are +# currently up! 
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g")) +UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)" +UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)" +VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)" + +# The victim brick should have a copy of the file. +TEST [ -n "$VICTIM_HAS_TEST" ] + +# Of the bricks which will remain standing, there should be only one +# brick which has the file called test. If they both have the first +# test file, the test is invalid as all the bricks are up and the +# halo-max-replicas is not being honored; e.g. bug exists. +TEST [ $([ -z "$UP1_HAS_TEST" ]) = $([ -z "$UP2_HAS_TEST" ]) ] + +echo "Failing child ${KILL_IDX}..." +TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX} + +# Test the mount is still RW (i.e. quorum works) +TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync + +# Calculate the MD5s +MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1) +MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1) + +# Verify the two up bricks have identical MD5s, if both are identical +# then we must have successfully failed-over to the brick which was +# previously proven to be down (via the ONLY_ONE test). +TEST [ "$MD5_UP1" == "$MD5_UP2" ] + +cleanup diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t new file mode 100644 index 00000000000..4574fdfe41e --- /dev/null +++ b/tests/basic/halo-hybrid.t @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test for the Halo hybrid feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +# heal daemon is off to start. +# 2. Write some data +# 3. Verify hybrid code chose children for lookups +# 4. Verify hybrid code chose child for reads +# 5. Verify hybrid code wrote synchronously to all replicas +# + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +function found_fuse_log_msg { + local dir="$1" + local msg="$2" + local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l) + if (( $cnt == 1 )); then + echo "Y" + else + echo "N" + fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-hybrid-mode True +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level TRACE +TEST $CLI volume start $V0 + +# Start a synchronous mount +TEST glusterfs --volfile-id=/$V0 \ + --xlator-option *replicate*.halo-max-latency=9999 \ + --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 +sleep 2 +cd $M0 + +TEST mkdir testdir +TEST cd testdir +for i in {1..5} +do + dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null +done +TEST ls -l + +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs" +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child" + +B0_CNT=$(ls $B0/${V0}0/testdir | wc -l) +B1_CNT=$(ls $B0/${V0}1/testdir | wc -l) +B2_CNT=$(ls $B0/${V0}2/testdir | wc -l) + +# Writes should be synchronous, all should have same +# file count +TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))" + +cleanup diff --git a/tests/basic/halo.t b/tests/basic/halo.t new file mode 100644 index 00000000000..25aca3442ab --- /dev/null +++ b/tests/basic/halo.t @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Test for the Halo geo-replication feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +# heal daemon is off to start. +# 2. Write some data +# 3. 
Verify at least one of the bricks did not receive the writes. +# 4. Turn the heal daemon on +# 5. Within 30 seconds the SHD should async heal the data over +# to the 3rd brick. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 + +for i in {1..5} +do + dd if=/dev/urandom of=f bs=1M count=1 2>/dev/null + mkdir a; cd a; +done + +B0_CNT=$(ls $B0/${V0}0 | wc -l) +B1_CNT=$(ls $B0/${V0}1 | wc -l) +B2_CNT=$(ls $B0/${V0}2 | wc -l) + +# One of the brick dirs should be empty +TEST "(($B0_CNT == 0 || $B1_CNT == 0 || $B2_CNT == 0))" + +# Ok, turn the heal daemon on and verify it heals it up +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 +cleanup diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t index 9df5cb45c3b..7f990c9aeb2 100755 --- a/tests/basic/mount-nfs-auth.t +++ b/tests/basic/mount-nfs-auth.t @@ -3,6 +3,13 @@ . $(dirname $0)/../include.rc . $(dirname $0)/../nfs.rc +# On test systems, connecting to ourselves by hostname appears at the other end +# as coming from localhost, so that's what needs to go in exports files etc. +# The only place we really need to use the actual hostname is in the Gluster +# volume-create thing. 
Maybe it's an IPv6 thing, maybe it's just a crazy +# resolver configuration, but this lets the test work. +H0=localhost + # Our mount timeout must be as long as the time for a regular configuration # change to be acted upon *plus* AUTH_REFRESH_TIMEOUT, not one replacing the # other. Otherwise this process races vs. the one making the change we're @@ -15,6 +22,9 @@ TEST glusterd TEST pidof glusterd TEST $CLI volume info +H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1) +H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}') + # Export variables for allow & deny EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" EXPORT_ALLOW_SLASH="/$V0/ $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" @@ -28,13 +38,21 @@ V0L1="$V0/L1" V0L2="$V0L1/L2" V0L3="$V0L2/L3" +NETGROUP_COMPLEX_ALLOW="storage storage.region\nstorage.region (1.2.3.4,,)\nngtop ng1\nng1 ($H0,,)" +EXPORT_COMPLEX_RO_ALLOW="/$V0L1 @storage(sec=sys,rw,anonuid=0) @ngtop(sec=sys,ro,anonuid=0)" + # Other variations for allow & deny +EXPORT_ALLOW_NETGROUP_RO="/$V0 @ngtop(sec=sys,ro,anonuid=0)" EXPORT_ALLOW_RO="/$V0 $H0(sec=sys,ro,anonuid=0) @ngtop(sec=sys,ro,anonuid=0)" EXPORT_ALLOW_L1="/$V0L1 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" EXPORT_WILDCARD="/$V0 *(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" function build_dirs () { - mkdir -p $B0/b{0,1,2}/L1/L2/L3 + mkdir -p $B0/b{0,1,2,3,4,5}/L1/L2/L3 +} + +function export_allow_this_host_ipv6 () { + printf "$EXPORT_ALLOW6\n" > ${NFSDIR}/exports } function export_allow_this_host () { @@ -46,6 +64,9 @@ function export_allow_this_host_with_slash () { } function export_deny_this_host () { + if [[ "$1" && "$1" != "$V0" ]]; then + local EXPORT_DENY=$(echo $EXPORT_DENY | sed "s/$V0/$1/") + fi printf "$EXPORT_DENY\n" > ${NFSDIR}/exports } @@ -61,6 +82,10 @@ function export_allow_this_host_ro () { printf "$EXPORT_ALLOW_RO\n" > ${NFSDIR}/exports } +function export_allow_netgroup_ro () { + 
printf "$EXPORT_ALLOW_NETGROUP_RO\n" > ${NFSDIR}/exports +} + function netgroup_allow_this_host () { printf "$NETGROUP_ALLOW\n" > ${NFSDIR}/netgroups } @@ -69,8 +94,16 @@ function netgroup_deny_this_host () { printf "$NETGROUP_DENY\n" > ${NFSDIR}/netgroups } +function netgroup_complex_allow() { + printf "$NETGROUP_COMPLEX_ALLOW\n" > ${NFSDIR}/netgroup +} + +function export_complex_ro_allow() { + printf "$EXPORT_COMPLEX_RO_ALLOW\n" > ${NFSDIR}/exports +} + function create_vol () { - $CLI vol create $V0 $H0:$B0/b0 + $CLI vol create $V0 $(hostname):$B0/b0 } function setup_cluster() { @@ -104,6 +137,10 @@ function check_mount_failure { fi } +function do_mount () { + mount_nfs $H0:/$1 $N0 nolock +} + function small_write () { dd if=/dev/zero of=$N0/test-small-write count=1 bs=1k 2>&1 if [ $? -ne 0 ]; then @@ -150,10 +187,7 @@ setup_cluster TEST $CLI vol set $V0 nfs.disable off TEST $CLI vol start $V0 -# Get NFS state directory -NFSDIR=$( $CLI volume get patchy nfs.mount-rmtab | \ - awk '/^nfs.mount-rmtab/{print $2}' | \ - xargs dirname ) +NFSDIR=/var/lib/glusterd/nfs ## Wait for volume to register with rpc.mountd EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available @@ -186,6 +220,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available ## Mount NFS EXPECT "Y" check_mount_success $V0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + +## Mount NFS using the IPv6 export +export_allow_this_host_ipv6 +EXPECT "Y" check_mount_success $V0 ## Disallow host TEST export_deny_this_host @@ -260,6 +299,31 @@ TEST ! create # Create should not be allowed TEST stat_nfs # Stat should be allowed EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 +TEST export_allow_netgroup_ro +TEST netgroup_allow_this_host +sleep $((AUTH_REFRESH_INTERVAL+1)) + +EXPECT_WITHIN $MY_MOUNT_TIMEOUT "Y" check_mount_success $V0 +# TBD: figure out why these two tests fail, so they can be reenabled +#EST ! small_write # Writes should not be allowed +#EST ! 
create # Create should not be allowed +TEST stat_nfs # Stat should be allowed +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + +# This test checks the case where the exports file +# has a 'rw' perm set for a netgroup followed +# by a 'ro' perm for a different netgroup. +TEST netgroup_complex_allow +TEST export_complex_ro_allow +sleep $((AUTH_REFRESH_INTERVAL+1)) + +EXPECT_WITHIN $MY_MOUNT_TIMEOUT "Y" check_mount_success $V0L1 +# TBD: figure out why these two tests fail, so they can be reenabled +#EST ! small_write # Writes should not be allowed +#EST ! create # Create should not be allowed +TEST stat_nfs # Stat should be allowed +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + TEST export_deny_this_host TEST netgroup_deny_this_host TEST export_allow_this_host_l1 # Allow this host at L1 @@ -320,9 +384,40 @@ TEST $CLI vol set $V0 nfs.auth-refresh-interval-sec 20 ## Do a simple test to see if the volume option exists TEST $CLI vol set $V0 nfs.auth-cache-ttl-sec 400 +## Test authentication in 1 of 2 (sub)volumes +ME=$(hostname) +TEST $CLI vol create $V1 replica 3 $ME:$B0/b3 $ME:$B0/b4 $ME:$B0/b5 +TEST $CLI vol set $V1 cluster.self-heal-daemon off +TEST $CLI vol set $V1 nfs.disable off +TEST $CLI vol set $V1 cluster.choose-local off +TEST $CLI vol start $V1 +TEST $CLI volume info $V1; + +EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "2" is_nfs_export_available $V0 +EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available $V1 +TEST $CLI vol set $V0 nfs.exports-auth-enable on +TEST $CLI vol set $V1 nfs.exports-auth-enable off +# Deny the hosts, but only effective on $V0 +TEST export_deny_this_host $V0 +TEST netgroup_deny_this_host +TEST export_deny_this_host $V1 + +sleep $AUTH_REFRESH_INTERVAL +TEST ! do_mount $V0 # Do a mount & test +TEST do_mount $V1 # Do a mount & test + +TEST touch /tmp/foo +TEST cp /tmp/foo $N0/ + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + ## Finish up TEST $CLI volume stop $V0 TEST $CLI volume delete $V0; TEST ! 
$CLI volume info $V0; +TEST $CLI volume stop $V1 +TEST $CLI volume delete $V1; +TEST ! $CLI volume info $V1; + cleanup diff --git a/tests/basic/pgfid-feat.t b/tests/basic/pgfid-feat.t index a7baeec7b7a..615a0cd867e 100644 --- a/tests/basic/pgfid-feat.t +++ b/tests/basic/pgfid-feat.t @@ -16,6 +16,7 @@ TEST pidof glusterd TEST $CLI volume info; TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4}; +TEST $CLI volume set $V0 performance.stat-prefetch off TEST $CLI volume set $V0 build-pgfid on; TEST $CLI volume start $V0; diff --git a/tests/basic/quota-anon-fd-nfs.t b/tests/basic/quota-anon-fd-nfs.t index d911cc90b87..a6dec6bfcf8 100755 --- a/tests/basic/quota-anon-fd-nfs.t +++ b/tests/basic/quota-anon-fd-nfs.t @@ -17,6 +17,7 @@ TEST pidof glusterd TEST $CLI volume info; TEST $CLI volume create $V0 $H0:$B0/brick1; +TEST $CLI volume set $V0 performance.stat-prefetch off EXPECT 'Created' volinfo_field $V0 'Status'; TEST $CLI volume set $V0 nfs.disable false diff --git a/tests/basic/quota.t b/tests/basic/quota.t index 7f8b21de6f8..99af5a4e7e4 100755 --- a/tests/basic/quota.t +++ b/tests/basic/quota.t @@ -19,6 +19,7 @@ TEST pidof glusterd TEST $CLI volume info; TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4}; +TEST $CLI volume set $V0 performance.stat-prefetch off EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; diff --git a/tests/basic/rpc-coverage.t b/tests/basic/rpc-coverage.t index a76ba7084eb..b5221dcd9dd 100755..100644 --- a/tests/basic/rpc-coverage.t +++ b/tests/basic/rpc-coverage.t @@ -10,6 +10,7 @@ TEST pidof glusterd TEST $CLI volume info; TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4,5,6,7,8}; +TEST $CLI volume set $V0 performance.stat-prefetch off EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; diff --git a/tests/basic/stats-dump.t b/tests/basic/stats-dump.t index 7da6e0605a4..2840498218b 100644 --- a/tests/basic/stats-dump.t +++ 
b/tests/basic/stats-dump.t @@ -12,6 +12,7 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} TEST $CLI volume set $V0 diagnostics.latency-measurement on TEST $CLI volume set $V0 diagnostics.count-fop-hits on TEST $CLI volume set $V0 diagnostics.stats-dump-interval 1 +TEST $CLI volume set $V0 performance.nfs.io-threads on TEST $CLI volume set $V0 nfs.disable off TEST $CLI volume start $V0 EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available @@ -36,6 +37,10 @@ NFSD_RET="$?" FUSE_OUTPUT="$(grep 'aggr.fop.write.count": "0"' ${GLUSTERD_WORKDIR}/stats/glusterfs_patchy.dump)" FUSE_RET="$?" +# Test that io-stats is getting queue sizes from io-threads +TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfs_nfsd.dump +TEST grep 'queue_size' ${GLUSTERD_WORKDIR}/stats/glusterfsd__d_backends_patchy?.dump + TEST [ 0 -ne "$BRICK_RET" ] TEST [ 0 -ne "$NFSD_RET" ] TEST [ 0 -ne "$FUSE_RET" ] diff --git a/tests/basic/uss.t b/tests/basic/uss.t index 6cfc0303895..d6ca416bd65 100644 --- a/tests/basic/uss.t +++ b/tests/basic/uss.t @@ -382,3 +382,5 @@ TEST ls $M0/.history/snap6/; TEST ! stat $M0/.history/snap6/aaa; cleanup; + +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 diff --git a/tests/basic/write-behind.t b/tests/basic/write-behind.t new file mode 100644 index 00000000000..edad59786af --- /dev/null +++ b/tests/basic/write-behind.t @@ -0,0 +1,53 @@ +#!/bin/bash +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function clear_stats { + > /var/lib/glusterfs/stats/glusterfs_d_backends_${V0}0.dump +} + +function got_expected_write_count { + expected_size=$1 + expected_value=$2 + grep aggr.write_${expected_size} "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump" | grep $expected_value + if [ $? 
== 0 ]; then + echo "Y"; + else + echo "N"; + fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} + +# These are needed for our tracking of write sizes +TEST $CLI volume set $V0 diagnostics.latency-measurement on +TEST $CLI volume set $V0 diagnostics.count-fop-hits on +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 + +# Disable this in testing to get deterministic results +TEST $CLI volume set $V0 performance.write-behind-trickling-writes off + +TEST $CLI volume start $V0 + +sleep 2; + +TEST glusterfs -s $H0 --volfile-id $V0 $M0 + +# Write a 100MB file with a window-size 1MB, we should get 100 writes of 1MB each +TEST dd if=/dev/zero of=$M0/100mb_file bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "1mb" 100 + +TEST $CLI volume set $V0 performance.write-behind-window-size 512KB + +# Write a 100MB file with a window-size 512KB, we should get 200 writes of 512KB each +TEST dd if=/dev/zero of=$M0/100mb_file_2 bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "512kb" 200 + +cleanup; diff --git a/tests/bugs/distribute/bug-1099890.t b/tests/bugs/distribute/bug-1099890.t index 1a19ba880c0..9f8ae1487cc 100644 --- a/tests/bugs/distribute/bug-1099890.t +++ b/tests/bugs/distribute/bug-1099890.t @@ -44,6 +44,8 @@ TEST $CLI volume set $V0 features.quota-deem-statfs on TEST $CLI volume quota $V0 limit-usage / 150MB; +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1 + TEST $CLI volume set $V0 cluster.min-free-disk 50% TEST glusterfs -s $H0 --volfile-id=$V0 $M0 diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t index c5a7f041ac8..8cf905a8f0b 100755 --- a/tests/bugs/distribute/bug-1161311.t +++ b/tests/bugs/distribute/bug-1161311.t @@ -53,8 +53,14 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0; TEST mkdir $M0/dir1 TEST mkdir -p $M0/dir2/dir3 -# Create a large file (1GB), so that rebalance takes time -dd if=/dev/urandom 
of=$M0/dir1/FILE2 bs=64k count=10240 +# Create a large file (6.4 GB), so that rebalance takes time +# Reading from /dev/urandom is slow, so we'll cat it together +dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240 +for i in {1..10}; do + cat /tmp/FILE2 >> $M0/dir1/FILE2 +done + +#dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240 # Rename the file to create a linkto, for rebalance to # act on the file diff --git a/tests/bugs/fb4482137.t b/tests/bugs/fb4482137.t new file mode 100755 index 00000000000..bd3be89326b --- /dev/null +++ b/tests/bugs/fb4482137.t @@ -0,0 +1,65 @@ +#!/bin/bash + +# +# Test the scenario where a SHD daemon suffers a frame timeout during a +# crawl. The expected behavior is that present crawl will continue +# after the timeout and not deadlock. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +function wait_for_shd_no_sink() { + local TIMEOUT=$1 + # If we see the "no active sinks" log message we know + # the heal is alive. It cannot proceed as the "sink" + # is hung, but it's at least alive and trying. + timeout $TIMEOUT grep -q 'replicate-0: no active sinks for' \ + <(tail -fn0 /var/log/glusterfs/glustershd.log) + return $? 
+} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info 2> /dev/null; + +# Setup a cluster with 3 replicas, and fav child by majority on +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}; +TEST $CLI volume set $V0 network.frame-timeout 2 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 cluster.heal-timeout 10 +TEST $CLI volume start $V0 +sleep 5 + +# Mount the volume +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \ + --attribute-timeout=0 --entry-timeout=0 + +# Kill bricks 1 +TEST kill_brick $V0 $H0 $B0/${V0}1 +sleep 1 + +# Write some data into the mount which will require healing +cd $M0 +for i in {1..1000}; do + dd if=/dev/urandom of=testdata_$i bs=64k count=1 2>/dev/null +done + +# Re-start the brick +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 + +sleep 1 +TEST hang_brick $V0 $H0 $B0/${V0}1 +sleep 4 +TEST wait_for_shd_no_sink 20 +cleanup + +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 diff --git a/tests/bugs/fb8149516.t b/tests/bugs/fb8149516.t new file mode 100644 index 00000000000..54372794c6f --- /dev/null +++ b/tests/bugs/fb8149516.t @@ -0,0 +1,40 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.read-subvolume-index 2 +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.heal-timeout 30 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 nfs.disable off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 +for i in {1..10} +do + dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null +done +cd ~ +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST rm -rf $B0/${V0}2/testfile* +TEST rm -rf $B0/${V0}2/.glusterfs + +TEST $CLI volume start $V0 force +EXPECT_WITHIN 20 "1" afr_child_up_status_in_shd $V0 2 + +# Verify we see all ten files when ls'ing, without the patch this should +# return no files and fail. 
+FILE_LIST=($(\ls $M0)) +TEST "((${#FILE_LIST[@]} == 10))" +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 + +cleanup diff --git a/tests/bugs/fuse/bug-858488-min-free-disk.t b/tests/bugs/fuse/bug-858488-min-free-disk.t index 635dc04d1e6..ab636575d3f 100644 --- a/tests/bugs/fuse/bug-858488-min-free-disk.t +++ b/tests/bugs/fuse/bug-858488-min-free-disk.t @@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2 ## Lets create volume TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}; +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1 ## Verify volume is created EXPECT "$V0" volinfo_field $V0 'Volume Name'; diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t index 9fc7ac3b845..3bc80ab9dab 100644 --- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t +++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t @@ -1,6 +1,6 @@ #!/bin/bash -## Test case for cluster.min-free-disk option validation. +## Test case for cluster.cluster.min-free-disk option validation. . $(dirname $0)/../../include.rc @@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2 TEST $CLI volume start $V0 ## Setting invalid value for option cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk 143.!/12 -TEST ! $CLI volume set $V0 min-free-disk 123% -TEST ! $CLI volume set $V0 min-free-disk 194.34% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12 +TEST ! $CLI volume set $V0 cluster.min-free-disk 123% +TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34% ## Setting fractional value as a size (unit is byte) for option ## cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk 199.051 -TEST ! $CLI volume set $V0 min-free-disk 111.999 +TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051 +TEST ! 
$CLI volume set $V0 cluster.min-free-disk 111.999 ## Setting valid value for option cluster.min-free-disk should pass -TEST $CLI volume set $V0 min-free-disk 12% -TEST $CLI volume set $V0 min-free-disk 56.7% -TEST $CLI volume set $V0 min-free-disk 120 -TEST $CLI volume set $V0 min-free-disk 369.0000 +TEST $CLI volume set $V0 cluster.min-free-disk 12% +TEST $CLI volume set $V0 cluster.min-free-disk 56.7% +TEST $CLI volume set $V0 cluster.min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 369.0000 cleanup; diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t index c30d2b852d4..1b9ca18c08a 100755 --- a/tests/bugs/glusterd/bug-859927.t +++ b/tests/bugs/glusterd/bug-859927.t @@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " " TEST $CLI volume set $V0 min-free-inodes 60% EXPECT "60%" volume_option $V0 cluster.min-free-inodes -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk " " -TEST $CLI volume set $V0 min-free-disk 60% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk " " +TEST $CLI volume set $V0 cluster.min-free-disk 60% EXPECT "60%" volume_option $V0 cluster.min-free-disk -TEST $CLI volume set $V0 min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 120 EXPECT "120" volume_option $V0 cluster.min-free-disk TEST ! $CLI volume set $V0 frame-timeout "" diff --git a/tests/bugs/nfs/bug-1166862.t b/tests/bugs/nfs/bug-1166862.t index f986fe36ab7..fd57ccb992b 100755 --- a/tests/bugs/nfs/bug-1166862.t +++ b/tests/bugs/nfs/bug-1166862.t @@ -65,3 +65,7 @@ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0 EXPECT '0' count_lines cat $GLUSTERD_WORKDIR/nfs/rmtab cleanup + +# rmtab support permanently hacked out on FB branch. 
+#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 diff --git a/tests/bugs/nfs/bug-904065.t b/tests/bugs/nfs/bug-904065.t index 0becb756da4..0d539a2341c 100755 --- a/tests/bugs/nfs/bug-904065.t +++ b/tests/bugs/nfs/bug-904065.t @@ -90,3 +90,7 @@ EXPECT '2' count_lines $M0/rmtab # rmtab. cleanup + +# rmtab support permanently hacked out on FB branch. +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 diff --git a/tests/bugs/quota/bug-1292020.t b/tests/bugs/quota/bug-1292020.t index 14b311c9d76..f713c74859b 100644 --- a/tests/bugs/quota/bug-1292020.t +++ b/tests/bugs/quota/bug-1292020.t @@ -4,10 +4,12 @@ . $(dirname $0)/../../volume.rc function write_sample_data () { - dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 | grep -i exceeded + dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 | + egrep -i 'exceeded|no space' && echo 'passed' } cleanup; +rm -f /tmp/kbv.log TEST glusterd; TEST pidof glusterd; @@ -18,7 +20,8 @@ TEST $CLI volume quota $V0 enable; TEST $CLI volume quota $V0 limit-usage / 1 TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0; -EXPECT "exceeded" write_sample_data + +EXPECT "passed" write_sample_data TEST $CLI volume stop $V0 TEST $CLI volume delete $V0 diff --git a/tests/bugs/replicate/bug-859581.t b/tests/bugs/replicate/bug-859581.t index d8b45a257a1..313067b6049 100755 --- a/tests/bugs/replicate/bug-859581.t +++ b/tests/bugs/replicate/bug-859581.t @@ -51,3 +51,5 @@ TEST $CLI volume delete $V0 cleanup +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 diff --git a/tests/cluster.rc b/tests/cluster.rc index 467bbcb06e1..42547f09e37 100644 --- a/tests/cluster.rc +++ b/tests/cluster.rc @@ -46,17 +46,18 @@ function define_glusterds() { bopt="management.transport.socket.bind-address=${!h}"; popt="--pid-file=${!b}/glusterd.pid"; sopt="management.glusterd-sockfile=${!b}/glusterd/gd.sock" + 
aopt="*.transport.address-family=inet" #Get the logdir logdir=`gluster --print-logdir` #Fetch the testcases name and prefix the glusterd log with it logfile=`echo ${0##*/}`_glusterd$i.log lopt="--log-file=$logdir/$logfile" if [ "$2" == "-LDEBUG" ]; then - eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; - eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; + eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; + eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; else - eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; - eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; + eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; + eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; fi done } diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6 new file mode 100644 index 00000000000..426b1ef5705 --- /dev/null +++ b/tests/configfiles/exports-v6 @@ -0,0 +1 @@ +/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) diff --git a/tests/env.rc.in b/tests/env.rc.in index 82971c4a8de..87befc3711d 100644 --- a/tests/env.rc.in +++ b/tests/env.rc.in @@ -28,3 +28,6 @@ export PYTHON PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH export PYTHONPATH + +TESTER_CFLAGS="@TESTER_CFLAGS@" +export TESTER_CFLAGS diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t new file mode 100755 index 00000000000..0fc5a241534 --- /dev/null +++ 
b/tests/features/brick-min-free-space.t @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Test storage.min-free-disk option works. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd + +TEST truncate -s 16M $B0/brick0 +TEST LOOPDEV=$(losetup --find --show $B0/brick0) +TEST mkfs.xfs $LOOPDEV + +mkdir -p $B0/$V0 + +TEST mount -t xfs $LOOPDEV $B0/$V0 + +########### +# AIO on # +########### + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio on + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +sleep 5 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +sleep 5 + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. 
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +############ +# AIO off # +############ + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +sleep 5 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +sleep 5 + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +cleanup; diff --git a/tests/features/lock_revocation.t b/tests/features/lock_revocation.t new file mode 100644 index 00000000000..cbf21b71650 --- /dev/null +++ b/tests/features/lock_revocation.t @@ -0,0 +1,52 @@ +#!/bin/bash +logdir=$(gluster --print-logdir) +BRICK_LOGFILES="$logdir/bricks/d-backends-brick?.log" +rm -f $BRICK_LOGFILES &> /dev/null + +# Test that lock revocation works + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc +cleanup; + +function deadlock_fop() { + local MNT=$1 + for i in {1..1000}; do + dd if=/dev/zero of=$MNT/testfile bs=1k count=10 &> /dev/null + if grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null; then + break + fi + done +} + +function monkey_unlock() { + grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null && echo SUCCESS + return 0 +} + +function append_to_file() { + local FILE_PATH=$1 + echo "hello" >> $FILE_PATH + return 0 +} + +#Init +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} +TEST $CLI volume set $V0 self-heal-daemon off +TEST $CLI volume set $V0 features.locks-monkey-unlocking on +TEST $CLI volume set $V0 features.locks-revocation-secs 2 +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=$V0 -s $H0 $M0; +TEST $GFS --volfile-id=$V0 -s $H0 $M1; + +# Deadlock writes to a file using monkey unlocking +deadlock_fop $M0 & +EXPECT_WITHIN 60 "SUCCESS" monkey_unlock + +# Sleep > unlock timeout and attempt to write to the file +sleep 3 +TEST append_to_file $M1/testfile + +cleanup diff --git a/tests/halo.rc b/tests/halo.rc new file mode 100644 index 00000000000..4cb7c81da85 --- /dev/null +++ b/tests/halo.rc @@ -0,0 +1,52 @@ +# Return the current Halo state of a given child (by index, i.e. 0 +# is first child). +function halo_child_state { + grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG | + tail -n1 | sed 's/^.* halo state: //' | sed 's/ .*$//' +} + +# Return number of Halo children which are in a given state. +# First parameter is total # children. +# Second parameter is state to match (e.g. "UP"). 
+function halo_children_in_state { + local CHILD_COUNT=$1 + local SUM=0 + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + if [ x"$(halo_child_state $CHILD)" == x"$2" ]; then + SUM=$((SUM+1)) + fi + done + echo $SUM +} + +# Return number of up halo children, +# First parameter is total # children, +function halo_children_up { + echo $(halo_children_in_state $1 "UP") +} + +# Return number of down halo children, +# First parameter is total # children, +function halo_children_down { + echo $(halo_children_in_state $1 "DOWN") +} + +# Return number of up & down halo children. +# First parameter is total number of children. +function halo_sum_child_states { + local CHILD_COUNT=$1 + + local UP=0 + local DOWN=0 + + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + local STATE=$(halo_child_state $CHILD) + if [ x"$STATE" == x"UP" ]; then + UP=$((UP+1)) + elif [ x"$STATE" == x"DOWN" ]; then + DOWN=$((DOWN+1)) + fi + done + + echo "$UP $DOWN" +} diff --git a/tests/include.rc b/tests/include.rc index 492e35a7b6c..8b6504e6c58 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -19,11 +19,13 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g CC=cc OSTYPE=$(uname -s) -ENV_RC=$(dirname $0)/../env.rc +M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point + +ENV_RC=$(dirname $0)/env.rc if [ ! -f $ENV_RC ]; then - ENV_RC=$(dirname $0)/../../env.rc + ENV_RC=$(dirname $0)/../env.rc if [ ! -f $ENV_RC ]; then - ENV_RC=$(dirname $0)/../../../env.rc + ENV_RC=$(dirname $0)/../../env.rc fi fi @@ -171,6 +173,7 @@ function test_footer() echo "FAILED COMMAND: $saved_cmd" fi if [ "$EXIT_EARLY" = "1" ]; then + cleanup exit $RET fi fi @@ -350,6 +353,7 @@ which killall > /dev/null || { which pidof > /dev/null || { pidof() { + $PYTHON pidof.py $@ } } @@ -422,11 +426,13 @@ stat -c %s /dev/null > /dev/null 2>&1 || { function cleanup() { + local OLDPWD=$PWD + cd # Things go pear-shaped if we're inside a Gluster mount. 
# Prepare flags for umount case `uname -s` in Linux) - flag="-l" + flag="-l -f --no-canonicalize" ;; NetBSD) flag="-f -R" @@ -573,6 +579,8 @@ function cleanup() # above to fail, promoting that into a failure of the whole test (and # thus of an entire regression-test run) seems a bit excessive. Make # sure we return good status anyway. + + cd $OLDPWD return 0 } @@ -612,6 +620,7 @@ function build_tester () then cflags="$cflags $(pkg-config glusterfs-api --cflags-only-I --libs-only-L)" fi + cflags="$cflags ${TESTER_CFLAGS}" $CC -g -o $(dirname $cfile)/$execname $cfile $cflags } @@ -1163,3 +1172,5 @@ function STAT_INO() echo 0 fi } + +systemctl stop nfs-mountd diff --git a/tests/nfs.rc b/tests/nfs.rc index 2140f311c33..ee52d96e6d3 100644 --- a/tests/nfs.rc +++ b/tests/nfs.rc @@ -23,7 +23,7 @@ function mount_nfs () local m=$2 local opt=$3 if [ ! -z "$opt" ]; then opt=",$opt"; fi - opt="soft,intr,vers=3$opt" + opt="soft,intr,nfsvers=3,proto=tcp$opt" nopt="" for o in ${opt//,/ }; do diff --git a/tests/volume.rc b/tests/volume.rc index f95c0013b2e..84630f3d4b4 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -237,6 +237,13 @@ function kill_brick { kill -9 $(get_brick_pid $vol $host $brick) } +function hang_brick { + local vol=$1 + local host=$2 + local brick=$3 + kill -STOP $(get_brick_pid $vol $host $brick) +} + function check_option_help_presence { local option=$1 $CLI volume set help | grep "^Option:" | grep -w $option diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 903fbb39f12..bce94bb8b3b 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht ec +SUBDIRS = aha stripe afr dht ec CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7532b014ff7..4c2343f8e9b 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -43,6 +43,13 @@ #include "afr-self-heald.h" #include "afr-messages.h" 
+#define CHILD_UP_STR "UP" +#define CHILD_DOWN_STR "DOWN" +#define CHILD_DISCONNECTED_STR "DOWN" + +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *fastest_children); + call_frame_t * afr_copy_frame (call_frame_t *base) { @@ -1078,7 +1085,8 @@ refresh_done: int afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error) { - call_frame_t *heal_frame = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal_frame = NULL; afr_local_t *local = NULL; gf_boolean_t start_heal = _gf_false; afr_local_t *heal_local = NULL; @@ -1092,13 +1100,15 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error) } local = frame->local; + priv = this->private; ret = afr_replies_interpret (frame, this, local->refreshinode, &start_heal); err = afr_inode_refresh_err (frame, this); - if (ret && afr_selfheal_enabled (this) && start_heal) { + if (priv->did_discovery == _gf_false || + (afr_selfheal_enabled (this) && start_heal)) { heal_frame = copy_frame (frame); if (!heal_frame) goto refresh_done; @@ -1380,6 +1390,12 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) "Unable to set list-xattr in dict "); } + ret = dict_set_int32 (xattr_req, GET_ANCESTRY_PATH_KEY, 42); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to set ancestry path key in dict "); + } + return ret; } @@ -1466,21 +1482,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode) sizeof(gfid_copy)) % child_count; } +/* + * afr_halo_read_subvol + * + * Given a array representing the readable children, this function will + * return which one of the readable children meet the halo hybrid criteria. + * In the event none are found, -1 is returned and another strategy will have + * to be used to figure out where the read should come from. 
+ */ +int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) { + afr_private_t *priv = NULL; + unsigned char *hybrid_children; + int32_t hybrid_cnt = 0; + int read_subvol = -1; + int i = 0; + + priv = this->private; + + /* Halo in-active or hybrid mode disabled, bail.... */ + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return -1; + + /* AFR Discovery edge case, if you are already pinned to a child + * which meets the latency threshold then go with this child for + * consistency purposes. + */ + if (priv->read_child >= 0 && readable[priv->read_child] && + priv->child_latency[priv->read_child] <= + AFR_HALO_HYBRID_LATENCY_MSEC) { + return priv->read_child; + } + + hybrid_children = alloca0 (priv->child_count); + hybrid_cnt = find_hybrid_children (this, hybrid_children); + if (hybrid_cnt) { + for (i = 0; i < priv->child_count; i++) { + if (readable[i] && hybrid_children[i]) { + read_subvol = i; + priv->read_child = read_subvol; + gf_log (this->name, GF_LOG_TRACE, + "Selected hybrid child %d for reads", + i); + break; + } + } + } + + return read_subvol; +} + int afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, unsigned char *readable, afr_read_subvol_args_t *args) { - int i = 0; - int read_subvol = -1; - afr_private_t *priv = NULL; + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; afr_read_subvol_args_t local_args = {0,}; - priv = this->private; + priv = this->private; - /* first preference - explicitly specified or local subvolume */ - if (priv->read_child >= 0 && readable[priv->read_child]) + /* Choose lowest latency child for reads */ + read_subvol = afr_halo_read_subvol (this, readable); + if (read_subvol != -1) + return read_subvol; + + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) return priv->read_child; if (inode_is_linked (inode)) { @@ -1506,7 +1576,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, return 
-1; } - int afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, unsigned char *readable, int *event_p, @@ -1697,6 +1766,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->readable); GF_FREE (local->readable2); + GF_FREE (local->heal_ancestry_path); + if (local->inode) inode_unref (local->inode); @@ -2166,6 +2237,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); priv->read_child = child_index; + } else if (priv->halo_enabled) { + if (priv->read_child < 0) { + priv->read_child = child_index; + } else if (priv->child_latency[child_index] < + priv->child_latency[priv->read_child]) { + priv->read_child = child_index; + } } out: STACK_DESTROY(frame->root); @@ -2357,7 +2435,6 @@ unwind: return 0; } - int afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { @@ -2523,6 +2600,8 @@ unwind: local->op_errno = ENOTCONN; } + priv->did_discovery = _gf_true; + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->replies[read_subvol].poststat, local->replies[read_subvol].xdata, @@ -2555,7 +2634,7 @@ afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->replies[child_index].xdata = dict_ref (xdata); } - if (local->do_discovery && (op_ret == 0)) + if (local->do_local_discovery && (op_ret == 0)) afr_attempt_local_discovery (this, child_index); if (xdata) { @@ -2583,6 +2662,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) afr_local_t *local = NULL; afr_private_t *priv = NULL; int call_count = 0; + unsigned char *hybrid_children = NULL; local = frame->local; priv = this->private; @@ -2593,8 +2673,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) goto out; } - call_count = local->call_count = AFR_COUNT (local->child_up, - priv->child_count); + hybrid_children = alloca0 (priv->child_count); + call_count = find_hybrid_children (this, hybrid_children); + if (call_count) { + for (i = 0; i 
< priv->child_count; i++) + local->child_up[i] = hybrid_children[i]; + gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid " + "children for LOOKUPs", call_count); + } else { + hybrid_children = NULL; + call_count = AFR_COUNT (local->child_up, priv->child_count); + } + + local->call_count = call_count; ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, &local->loc); @@ -2648,12 +2739,12 @@ afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req if (!priv->root_inode) priv->root_inode = inode_ref (loc->inode); - if (priv->choose_local && !priv->did_discovery) { + if (priv->choose_local && !priv->did_local_discovery) { /* Logic to detect which subvolumes of AFR are local, in order to prefer them for reads */ - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; + local->do_local_discovery = _gf_true; + priv->did_local_discovery = _gf_true; } } @@ -2827,6 +2918,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get (loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); + /* So this is the "secret" to why "Hybrid" halo works. Encoded in + * the cached inodes, we store what is effectively the "generational" + * state of the cluster along with a "packed" version of the extended + * attributes which determine which nodes are wise/fools. We can + * consult these cached values to figure out who we can trust, in the + * event the state of our cluster changes and we can no longer trust + * the cached info we "refresh" the inode (and hit all regions) to + * ensure we know which bricks we can safely read from. 
+ */ if (event != local->event_generation) afr_inode_refresh (frame, this, loc->parent, NULL, afr_lookup_do); @@ -3051,7 +3151,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4317,25 +4417,569 @@ __get_heard_from_all_status (xlator_t *this) return heard_from_all; } +/* + * _afr_cmp_child + * + * Passed to the qsort function to order a list of children by the latency + * and/or up/down states. + * + * Note: This isn't as simple as taking the latencies and calling it + * a day. Children can be marked down, which overrides their latency + * signal. Having a lower-latency child available doesn't guarantee this + * child shall be marked up: we don't want to constantly be swapping + * slightly better bricks for others...this is jarring to clients and + * could cause all sorts of issues. Plus, the fail-over, max-replicas + * flags must all be honored which manage the up/down state of children. + * + * In short, the (as marked) up/down state of the brick shall always + * take precedence when sorting by latency. 
+ */ +static int +_afr_cmp_child (const void *child1, const void *child2) +{ + struct afr_child *child11 = (struct afr_child *)child1; + struct afr_child *child22 = (struct afr_child *)child2; + + /* If both children are _marked_ down they are equal */ + if (!child11->child_up && !child22->child_up) + return 0; + + /* Prefer child 2, child 1 is _marked_ down, child 2 is not */ + if (!child11->child_up && child22->child_up) + return 1; + + /* Prefer child 1, child 2 is _marked_ down, child 1 is not */ + if (child11->child_up && !child22->child_up) + return -1; + + if (child11->latency > child22->latency) { + return 1; + } + if (child11->latency == child22->latency) { + return 0; + } + return -1; +} + +/* + * find_hybrid_children + * + * Given a char array representing our children (aka bricks within our + * AFR "subvolume"), we'll mark this array with the children which are + * within the halo_hybrid_read_max_latency_msec or if none fit this condition, + * we'll pick the fastest two bricks. + * + * You might ask, why not just pick the quickest brick and be done with it? + * Well, being within our set is not sufficient to be chosen for the read, + * we must also be marked "readable", we still want to choose as many as + * we can within our local region to ensure we have somebody that is readable. + * + * To illustrate this, consider the case where 1 of 2 bricks received a sync + * from some other writer, and the 2nd brick although faster wasn't present. + * In this case we'll want to use the slower brick to service the read. + * + * In short, this function just tells the caller which children are hybrid children; + * it gives no signal as to their readability, nor should it since this is + * handled later in the various flows (e.g. by afr_halo_read_subvol). 
+ */ +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *hybrid_children) +{ + int32_t i = 0; + afr_private_t *priv = NULL; + struct afr_child *sorted_list = NULL; + uint32_t max_latency; + uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT; + + priv = this->private; + + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return 0; + + if (limit > priv->child_count) + limit = priv->child_count; + + max_latency = priv->halo_hybrid_read_max_latency_msec; + + sorted_list = alloca (sizeof (struct afr_child) * priv->child_count); + + /* Find children meeting the latency threshold */ + for (i = 0; i < priv->child_count; i++) { + sorted_list[i].idx = i; + sorted_list[i].child_up = priv->child_up[i]; + sorted_list[i].latency = priv->child_latency[i]; + } + + /* QuickSort the children according to latency */ + qsort (sorted_list, priv->child_count, sizeof (struct afr_child), + _afr_cmp_child); + + i = 0; + while (i < priv->child_count && sorted_list[i].latency <= max_latency) + hybrid_children[sorted_list[i++].idx] = 1; + + /* Found some candidates */ + if (i != 0) + return i; + + /* If no candidates can be found meeting the max_latency threshold + * then find the best of those we have to our limit. 
+ */ + for (i = 0; i < limit; i++) + hybrid_children[sorted_list[i].idx] = 1; + + return i; +} + +int +find_best_down_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) " + "@ %ld ms latency", best_child, best_latency); + } + return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] >= worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)" + " @ %ld ms latency", worst_child, worst_latency); + } + return worst_child; +} + +static const char *halo_state_str(int i) +{ + switch (i) { + case 0: return "DOWN"; + case 1: return "UP"; + } + + return "unknown"; +} + + +static void dump_halo_states (xlator_t *this) { + afr_private_t *priv = NULL; + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (N/A)", + i, + halo_state_str(priv->child_up[i])); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (%"PRIi64" ms)", + i, + halo_state_str(priv->child_up[i]), + priv->child_latency[i]); + } + } +} + +static void +_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, + const int idx, const int64_t 
halo_max_latency_msec, + int32_t *event, int64_t *child_latency_msec, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int best_down_child = 0; + uint64_t latency_samples = 0; + + priv = this->private; + + /* Base it off the _minimum_ latency we've ever seen */ + *child_latency_msec = child_xlator->client_latency.min / 1000.0; + latency_samples = child_xlator->client_latency.count; + priv->child_latency[idx] = *child_latency_msec; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] == 1) { + up_children++; + } + } + + /* Don't do anything until you have some minimum numbner of + * latency samples */ + if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) { + gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient " + " number of latency samples (%" PRIu64 + " < %d), halo in-active.", + latency_samples, priv->halo_min_samples); + } + + gf_log (this->name, GF_LOG_DEBUG, + "ping: child %u (%s) latency %"PRIu64" ms (max %"PRIu64" ms)" + " up_count %d (min %d) enabled %s", + idx, child_xlator ? child_xlator->name : "<null>", + *child_latency_msec, + halo_max_latency_msec, + up_children, + priv->halo_min_replicas, + child_halo_enabled ? "true" : "false"); + + /* + * Case 1: This child's latency exceeds the maximum allowable + * for this halo. + */ + if (child_halo_enabled && + *child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && + up_children > priv->halo_min_replicas) { + if (find_worst_up_child (this) == idx) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "marking child down, " + "min_replicas (%d) still " + "satisfied.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_min_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + /* + * Case 2: Child latency is within halo and currently marked down, + * mark it up. 
+ */ + } else if ((child_halo_enabled == _gf_false || + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 0) { + if (child_halo_enabled == _gf_false || + up_children < priv->halo_max_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "below halo threshold (%ld) or halo is " + "disabled, marking child up.", + *child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_UP; + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", idx, + priv->halo_max_replicas); + } + /* + * Case 3: Child latency is within halo,and currently marked up, + * mark it down if it's the highest latency child and the + * number of up children is greater than halo_max_replicas. + * UNLESS you are an SHD in which case do nothing. + */ + } else if ((child_halo_enabled == _gf_true && + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 1) { + if (find_worst_up_child (this) == idx && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "but halo_max_replicas (%d) exceeded, " + "marking child down.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_max_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + } + + if (*event != GF_EVENT_CHILD_PING && + gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, + int32_t *event, int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int worst_up_child = -1; + gf_boolean_t was_down = _gf_false; + + priv = this->private; + + /* + * This only really counts if the child was never 
up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + /* + * Track the fact we did this, we may need to repeal this + * if we later decide to mark this brick down. + */ + was_down = _gf_true; + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (child_halo_enabled == _gf_true && + up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > + halo_max_latency_msec) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + priv->child_up[worst_up_child] = 0; + up_children--; + gf_log (this->name, GF_LOG_DEBUG, + "Marking child %d down, " + "doesn't meet halo threshold " + "(%ld), and > " + "halo_min_replicas (%d)", + worst_up_child, + halo_max_latency_msec, + priv->halo_min_replicas); + goto out; + } + } + if (priv->halo_enabled && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + worst_up_child = find_worst_up_child (this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + gf_log (this->name, GF_LOG_INFO, + "Marking child %d down, " + "up_children (%d) > " + "halo_max_replicas (%d)", + worst_up_child, + up_children, + priv->halo_max_replicas); + up_children--; + goto out; + } +out: + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + } else { + *event = 
GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, + int idx, int64_t child_latency_msec, + int64_t halo_max_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + gf_boolean_t swap_child = _gf_false; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to + * indicate the child is really disconnected. + */ + if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) { + priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY; + } + priv->child_up[idx] = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. 
+ */ + best_down_child = find_best_down_child (this); + if (child_halo_enabled == _gf_true) { + if (up_children < priv->halo_min_replicas && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + else if (up_children < priv->halo_max_replicas && + priv->child_latency[best_down_child] <= + halo_max_latency_msec && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + } + + if (swap_child) { + if (best_down_child >= 0) { + gf_log (this->name, GF_LOG_INFO, + "Swapping out child %d for " + "child %d to satisfy " + "halo_min_replicas (%d).", + idx, best_down_child, + priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going " + "offline until atleast one of them " + "comes back up."); + } else { + *event = GF_EVENT_CHILD_MODIFIED; + } + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +int64_t +_afr_get_halo_latency (xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = + priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %ld", + halo_max_latency_msec); + return halo_max_latency_msec; +} + + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) { + xlator_t *child_xlator = NULL; afr_private_t *priv = NULL; int i = -1; - int up_children = 0; - int down_children = 0; int propagate = 0; int had_heard_from_all = 0; int 
have_heard_from_all = 0; int idx = -1; int ret = -1; int call_psh = 0; + int up_child = -1; + uint64_t latency_samples = 0; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY; + gf_boolean_t child_halo_enabled = _gf_false; + child_xlator = (xlator_t *)data; priv = this->private; if (!priv) @@ -4347,8 +4991,9 @@ afr_notify (xlator_t *this, int32_t event, * that we could end up issuing N lookups to the first subvolume, and * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. */ + priv->did_local_discovery = _gf_false; priv->did_discovery = _gf_false; - + latency_samples = child_xlator->client_latency.count; /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has @@ -4359,7 +5004,7 @@ afr_notify (xlator_t *this, int32_t event, * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. 
*/ - idx = find_child_index (this, data); + idx = find_child_index (this, child_xlator); if (idx < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -4368,6 +5013,28 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); + + if (!priv->halo_enabled || + latency_samples < priv->halo_min_samples) { + child_halo_enabled = _gf_false; + halo_max_latency_msec = INT64_MAX; + } else { + child_halo_enabled = _gf_true; + halo_max_latency_msec = _afr_get_halo_latency (this); + } + + if (event == GF_EVENT_CHILD_PING) { + /* Calculates the child latency and sets event + */ + LOCK (&priv->lock); + { + _afr_handle_ping_event (this, child_xlator, idx, + halo_max_latency_msec, &event, + &child_latency_msec, child_halo_enabled); + } + UNLOCK (&priv->lock); + } + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { @@ -4394,52 +5061,16 @@ afr_notify (xlator_t *this, int32_t event, propagate = 1; break; case GF_EVENT_CHILD_UP: - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. 
- */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_children = __afr_get_up_children_count (priv); - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - } else { - event = GF_EVENT_CHILD_MODIFIED; - } - - priv->last_event[idx] = event; - + _afr_handle_child_up_event (this, child_xlator, + idx, halo_max_latency_msec, &event, &call_psh, + &up_child, child_halo_enabled); break; case GF_EVENT_CHILD_DOWN: - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_ALL_SUBVOLS_DOWN, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); - } else { - event = GF_EVENT_SOME_CHILD_DOWN; - } - - priv->last_event[idx] = event; - + _afr_handle_child_down_event (this, child_xlator, idx, + child_latency_msec, halo_max_latency_msec, + &event, &call_psh, &up_child, + child_halo_enabled); break; case GF_EVENT_CHILD_CONNECTING: @@ -4466,7 +5097,6 @@ afr_notify (xlator_t *this, int32_t event, had come up, propagate CHILD_UP, but only this time */ event = GF_EVENT_CHILD_DOWN; - up_children = __afr_get_up_children_count (priv); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -4542,7 +5172,7 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) local->call_count = AFR_COUNT (local->child_up, priv->child_count); if (local->call_count == 0) { gf_msg (THIS->name, GF_LOG_INFO, 0, - AFR_MSG_ALL_SUBVOLS_DOWN, "no subvolumes up"); + AFR_MSG_ALL_SUBVOLS_DOWN, "no bricks up"); if (op_errno) *op_errno = ENOTCONN; goto out; diff --git 
a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 2b369ca3c68..a917bc08ae0 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1538,6 +1538,15 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; /* + * Heal daemons don't have IO threads ... and as a result they + * send this getxattr down and eventually crash :( + */ + if (strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { + ret = -EINVAL; + goto out; + } + + /* * Special xattrs which need responses from all subvols */ if (afr_is_special_xattr (name, &cbk, 0)) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7f7962013d7..c7d6261b110 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -46,7 +46,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_spbc_timeout_t, gf_afr_mt_spb_status_t, gf_afr_mt_empty_brick_t, - gf_afr_mt_end + gf_afr_mt_child_latency_t, + gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index adf5ab20a6c..629f1c6a7da 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -664,6 +664,20 @@ out: } +static int +replies_are_same (struct afr_reply *replies, int i, int k) +{ + if (replies[k].poststat.ia_mtime != replies[i].poststat.ia_mtime) { + return _gf_false; + } + if (replies[k].poststat.ia_size != replies[i].poststat.ia_size) { + return _gf_false; + } + + return gf_uuid_compare (replies[i].poststat.ia_gfid, + replies[k].poststat.ia_gfid) == 0; +} + int afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies, inode_t *inode) @@ -683,14 +697,10 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies, priv->children[i]->name, replies[i].poststat.ia_mtime, replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); + uuid_utoa 
(replies[i].poststat.ia_gfid)); vote_count = 0; - for (k = 0; k < priv->child_count; k++) { - if ((replies[k].poststat.ia_mtime == - replies[i].poststat.ia_mtime) && - (replies[k].poststat.ia_size == - replies[i].poststat.ia_size) - ) { + for (k = 1; k < priv->child_count; k++) { + if (replies_are_same (replies, i, k)) { vote_count++; } } @@ -724,7 +734,7 @@ afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode) priv->children[i]->name, replies[i].poststat.ia_mtime, replies[i].poststat.ia_mtime_nsec, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_mtime > cmp_mtime) { cmp_mtime = replies[i].poststat.ia_mtime; cmp_mtime_nsec = @@ -764,7 +774,7 @@ afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode) priv->children[i]->name, replies[i].poststat.ia_ctime, replies[i].poststat.ia_ctime_nsec, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_ctime > cmp_ctime) { cmp_ctime = replies[i].poststat.ia_ctime; cmp_ctime_nsec = @@ -802,7 +812,7 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode) "file size = %lu for gfid %s", priv->children[i]->name, replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); + uuid_utoa (replies[i].poststat.ia_gfid)); if (replies[i].poststat.ia_size > cmp_sz) { cmp_sz = replies[i].poststat.ia_size; fav_child = i; @@ -901,7 +911,7 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame, "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, " "%s ctime).", priv->children[fav_child]->name, - uuid_utoa (inode->gfid), + uuid_utoa (replies[fav_child].poststat.ia_gfid), policy_str, replies[fav_child].poststat.ia_size, mtime_str, @@ -929,6 +939,7 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, unsigned char *healed_sinks, unsigned char *locked_on, struct afr_reply *replies, + afr_transaction_type type) { afr_local_t *local = NULL; @@ 
-1201,7 +1212,6 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, } } - /* count the number of dirty fops witnessed */ for (i = 0; i < priv->child_count; i++) witness[i] += dirty[i]; @@ -1209,6 +1219,67 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, return 0; } +/* + * This function will examine a reply and look for a PGFID xattr + * and if found will record this in the frame's local struct. + * + * This can then be used to fall-back to healing the parent + * directory in cases where metadata/data healing isn't yet + * possible because an entry heal of the parent directory has not + * yet taken place. + * + * This is critical for a couple reasons: + * 1. General healing predictability - When the SHD + * attempts to heal a given GFID, it should be able + * to do so without having to wait for some other + * dependent heal to take place. + * 2. Reliability - In some cases the parent directory + * may require healing, but the req'd entry in the + * indices/xattrop directory may not exist + * (e.g. bugs/crashes etc). 
This feature removes + * + */ +void +_afr_set_heal_pgfid_from_reply (xlator_t *this, afr_local_t *local, + struct afr_reply reply) +{ + data_pair_t *trav = reply.xdata->members_list; + uuid_t *pgfid = NULL; + int32_t ret = 0; + int32_t pgfid_prefix_len = sizeof (PGFID_XATTR_KEY_PREFIX) - 1; + char *pgfid_str = NULL; + data_t *ancestry_path_data = NULL; + char *ancestry_path = "Unknown"; + + pgfid = &local->heal_pgfid; + + while (trav) { + if (!strncmp (PGFID_XATTR_KEY_PREFIX, trav->key, + pgfid_prefix_len)) { + pgfid_str = trav->key + pgfid_prefix_len; + ret = gf_uuid_parse (pgfid_str, *pgfid); + break; + } + trav = trav->next; + } + + if (!ret && !gf_uuid_is_null (*pgfid)) { + if (!dict_lookup (reply.xdata, + "glusterfs.ancestry.path", + &ancestry_path_data)) { + ancestry_path = data_to_str ( + ancestry_path_data); + /* Allocation free'd on local destroy */ + local->heal_ancestry_path = + gf_strdup (ancestry_path); + } + gf_log (this->name, GF_LOG_DEBUG, + "Found pgfid (%s) for %s", + uuid_utoa (*pgfid), + ancestry_path); + } +} + void afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, int source, unsigned char *sources, @@ -1239,7 +1310,6 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, } } } - if (ret < 0) { status = "Failed"; loglevel = GF_LOG_DEBUG; @@ -1777,6 +1847,8 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, gf_boolean_t *entry_selfheal) { afr_private_t *priv = NULL; + afr_local_t *local = NULL; + inode_t *inode = NULL; int i = 0; int valid_cnt = 0; @@ -1785,6 +1857,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, int ret = -1; priv = this->private; + local = frame->local; inode = afr_inode_find (this, gfid); if (!inode) @@ -1802,6 +1875,10 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, if (replies[i].op_ret == -1) continue; + if (gf_uuid_is_null(local->heal_pgfid)) + _afr_set_heal_pgfid_from_reply (this, + frame->local, replies[i]); + /* The 
data segment of the changelog can be non-zero to indicate * the directory needs a full heal. So the check below ensures * it's not a directory before setting the data_selfheal boolean. @@ -1814,8 +1891,11 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, afr_is_metadata_set (this, replies[i].xdata)) *metadata_selfheal = _gf_true; - if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata)) - *entry_selfheal = _gf_true; + if ((!priv->shd.iamshd && AFR_IS_ROOT_GFID (gfid) && + priv->did_discovery == _gf_false) || + (entry_selfheal && + afr_is_entry_set (this, replies[i].xdata))) + *entry_selfheal = _gf_true; valid_cnt++; if (valid_cnt == 1) { @@ -1831,8 +1911,14 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, (int) replies[i].poststat.ia_type, priv->children[i]->name, uuid_utoa (replies[i].poststat.ia_gfid)); - ret = -EIO; - goto out; + + if (priv->gfid_splitbrain_forced_heal && + metadata_selfheal) { + *metadata_selfheal = _gf_true; + } else { + ret = -EIO; + goto out; + } } if (!IA_EQUAL (first, replies[i].poststat, uid)) { @@ -1875,6 +1961,15 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, *metadata_selfheal = _gf_true; } + /* Force entry healing of directories for SHDs regardless + * of the entry healing portion of the change log. 
+ */ + if (IA_ISDIR(first.ia_type) && priv->shd.iamshd && + IA_EQUAL (first, replies[i].poststat, type) && + entry_selfheal) { + *entry_selfheal = _gf_true; + } + if (IA_ISREG(first.ia_type) && !IA_EQUAL (first, replies[i].poststat, size)) { gf_msg_debug (this->name, 0, @@ -1970,6 +2065,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, { int ret = 0; int i = 0; + int source_count = 0; afr_private_t *priv = NULL; dict_t *xattr = NULL; int **changelog = NULL; @@ -1990,12 +2086,27 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, goto out; } - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - ret |= afr_selfheal_post_op (frame, this, inode, i, xattr, + /* Pre-compute how many sources we have, if we made it in here + * without any sources defined, we are doing a conservative + * merge + */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_count++; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* If there are no sources we are doing a conservative + * merge. In such a case ensure we mark the changelog + * on all replicas. 
+ */ + if (!sources[i] && source_count) { + continue; + } + ret |= afr_selfheal_post_op (frame, this, inode, i, xattr, NULL); - } + } out: if (changelog) afr_matrix_cleanup (changelog, priv->child_count); @@ -2029,6 +2140,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) &data_selfheal, &metadata_selfheal, &entry_selfheal); + if (ret) goto out; @@ -2075,10 +2187,19 @@ int afr_selfheal (xlator_t *this, uuid_t gfid) { int ret = -1; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; + gf_boolean_t tried_parent = _gf_false; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + char *ancestry_path = "Unknown"; + char *pgfid_str = NULL; + char *gfid_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + +heal_gfid: + frame = afr_frame_create (this); - frame = afr_frame_create (this); if (!frame) return ret; @@ -2087,6 +2208,47 @@ afr_selfheal (xlator_t *this, uuid_t gfid) ret = afr_selfheal_do (frame, this, gfid); + if (priv->pgfid_self_heal == _gf_true && + tried_parent == _gf_false && (ret != 0 || ret != 2) && + !gf_uuid_is_null (local->heal_pgfid)) { + tried_parent = _gf_true; + pgfid_str = alloca (strlen (UUID0_STR) + 1); + gfid_str = alloca (strlen (UUID0_STR) + 1); + uuid_utoa_r (local->heal_pgfid, pgfid_str); + uuid_utoa_r (gfid, gfid_str); + if (local->heal_ancestry_path) + ancestry_path = local->heal_ancestry_path; + gf_log (this->name, GF_LOG_INFO, + "PGFID Healing - Heal failed for %s (%s), " + "but found parent gfid (%s), attempting to heal " + "parent directory by gfid.", + gfid_str, + ancestry_path, + pgfid_str); + ret = afr_selfheal (this, local->heal_pgfid); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "PGFID Healing - Healing of parent gfid " + "(%s) unsuccessful! Healing of %s (%s) " + "failed.", + pgfid_str, + gfid_str, + ancestry_path); + } else { + gf_log (this->name, GF_LOG_INFO, + "PGFID Healing - Healing of parent gfid %s " + "successful! 
Re-attempting heal of %s (%s).", + pgfid_str, + gfid_str, + ancestry_path); + if (frame) { + AFR_STACK_DESTROY (frame); + frame = NULL; + } + goto heal_gfid; + } + } + if (frame) AFR_STACK_DESTROY (frame); @@ -2230,3 +2392,19 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources, out: return source; } + +void +afr_sh_get_source_by_policy (xlator_t *this, + unsigned char *sources, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, inode_t *inode) +{ + int fav_child = -1; + char *policy_str; + + fav_child = afr_sh_get_fav_by_policy (this, replies, inode, + &policy_str); + sources[fav_child] = 1; + healed_sinks[fav_child] = 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index cf03a9ec680..c1e945bfd82 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -324,7 +324,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; off_t off = 0; - size_t block = 128 * 1024; + size_t block = 0; int type = AFR_SELFHEAL_DATA_FULL; int ret = -1; call_frame_t *iter_frame = NULL; @@ -336,6 +336,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, healed_sinks[ARBITER_BRICK_INDEX] = 0; } + block = 128 * 1024 * priv->data_self_heal_window_size; + type = afr_data_self_heal_type_get (priv, healed_sinks, source, replies); @@ -716,7 +718,6 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, goto unlock; ret = 0; - } unlock: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, @@ -752,7 +753,6 @@ skip_undo_pending: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, data_lock); out: - if (did_sh) afr_log_selfheal (fd->inode->gfid, this, ret, "data", source, sources, healed_sinks); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 
5b536b0ded8..25f8ea313aa 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -16,16 +16,6 @@ #include "afr-messages.h" #include "syncop-utils.h" -/* Max file name length is 255 this filename is of length 256. No file with - * this name can ever come, entry-lock with this name is going to prevent - * self-heals from older versions while the granular entry-self-heal is going - * on in newer version.*/ -#define LONG_FILENAME "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - static int afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name, inode_t *inode, int child, struct afr_reply *replies) @@ -66,7 +56,30 @@ afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name, ret = syncop_unlink (subvol, &loc, NULL, NULL); break; } - } + /* Handle edge case where directories exist in a partially + * created state: empty, without a gfid assigned. We need to + * remove these bad dirs so the normal entry heal process + * can take place. + */ + } else if (replies[child].valid && + replies[child].op_ret == -1 && + replies[child].op_errno == ENODATA && + gf_uuid_is_null (replies[child].poststat.ia_gfid)) { + if (replies[child].poststat.ia_type == IA_INVAL) { + gf_log (this->name, GF_LOG_WARNING, + "expunging orphaned (gfid-less) dir " + "%s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, + g), subvol->name); + /* We will only do this for _directories_, and this + * will only succeed for directories _without_ + * data. The file case is handled well already + * through the metadata self-heal process. 
+ */ + ret = syncop_rmdir (subvol, &loc, 1, NULL, NULL); + } + } loc_wipe (&loc); @@ -299,11 +312,12 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, } } + /* Returning EIO here isn't needed if GFID forced heal is + * enabled. + */ /* In case of a gfid or type mismatch on the entry, return -1.*/ - ret = afr_selfheal_detect_gfid_and_type_mismatch (this, replies, - fd->inode->gfid, - name, source); - + ret = afr_selfheal_detect_gfid_and_type_mismatch (this, + replies, fd->inode->gfid, name, source); if (ret < 0) return ret; @@ -314,10 +328,20 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, if (replies[i].op_errno != ENOENT) continue; - ret = afr_selfheal_recreate_entry (frame, i, source, sources, - fd->inode, name, inode, - replies); - } + /* Re-create the entry in the event the child + * does not have it, or the entry does not have + * a gfid. In the latter case we'll only do + * this for now if it's directory, this can be + * widened to include files at a later time. 
+ */ + if (replies[i].op_errno == ENOENT || + (replies[i].op_errno == ENODATA && + gf_uuid_is_null (replies[i].poststat.ia_gfid))) { + ret = afr_selfheal_recreate_entry ( + frame, i, source, sources, fd->inode, name, inode, + replies); + } + } return ret; } @@ -435,7 +459,9 @@ __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, sources_count = AFR_COUNT (sources, priv->child_count); if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) - || !sources_count || afr_does_witness_exist (this, witness)) { + || !sources_count || afr_does_witness_exist (this, witness) + || (sources_count == priv->child_count && + priv->did_discovery == _gf_false)) { memset (sources, 0, sizeof (*sources) * priv->child_count); afr_mark_active_sinks (this, sources, locked_on, healed_sinks); @@ -652,7 +678,6 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; afr_private_t *priv = NULL; gf_boolean_t mismatch = _gf_false; - afr_local_t *iter_local = NULL; afr_local_t *local = NULL; loc_t loc = {0,}; @@ -685,10 +710,34 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) continue; + /* Common Case: First do a cheap normal entry_dirent + * flow */ ret = afr_selfheal_entry_dirent (iter_frame, this, fd, entry->d_name, loc.inode, subvol, local->need_full_crawl); + + /* Edge Case: Do name heal to fix gfid split + * brains and other damage to directory + * entries. + */ + if (ret) { + /* If the cheap flow didn't work, let's head + * into the name self-heal flow. Here we'll + * inspect for GFID split-brains and fix if + * found. Then send it back to the normal + * entry_dirent flow. 
+ */ + ret = afr_selfheal_name (this, fd->inode->gfid, + entry->d_name, NULL); + if (!ret) { + ret = afr_selfheal_entry_dirent ( + iter_frame, this, fd, + entry->d_name, loc.inode, subvol, + local->need_full_crawl); + } + } + AFR_STACK_RESET (iter_frame); if (iter_frame->local == NULL) { ret = -ENOTCONN; @@ -1045,45 +1094,22 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tie_breaker_entrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); - { - if (ret < AFR_SH_MIN_PARTICIPANTS) { - gf_msg_debug (this->name, 0, "%s: Skipping " - "entry self-heal as only %d sub-volumes could " - "be locked in %s domain", - uuid_utoa (fd->inode->gfid), ret, - priv->sh_domain); - /* Either less than two subvols available, or another - selfheal (from another server) is in progress. Skip - for now in any case there isn't anything to do. - */ - ret = -ENOTCONN; - goto unlock; - } + if (ret < AFR_SH_MIN_PARTICIPANTS) { + gf_msg_debug (this->name, 0, "%s: Skipping " + "entry self-heal as only %d sub-volumes could " + "be locked in %s domain", + uuid_utoa (fd->inode->gfid), ret, + priv->sh_domain); + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. 
+ */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry (frame, this, fd, locked_on); - if (!granular_locks) { - ret = afr_selfheal_tryentrylk (frame, this, inode, - this->name, LONG_FILENAME, - long_name_locked); - } - { - if (!granular_locks && ret < 1) { - gf_msg_debug (this->name, 0, "%s: Skipping" - " entry self-heal as only %d " - "sub-volumes could be " - "locked in special-filename " - "domain", - uuid_utoa (fd->inode->gfid), - ret); - ret = -ENOTCONN; - goto unlock; - } - ret = __afr_selfheal_entry (frame, this, fd, locked_on); - } - if (!granular_locks) - afr_selfheal_unentrylk (frame, this, inode, this->name, - LONG_FILENAME, long_name_locked, - NULL); - } unlock: afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on, NULL); diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index db1b1cc889f..4570ace7ef7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -216,6 +216,17 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, sources_count = AFR_COUNT (sources, priv->child_count); + /* __afr_selfheal_metadata_prepare tinkers with the state + * of healed_sinks pre-maturely (the source hasn't + * actually been finalized yet!), so reset the children + * which aren't our source to sinks so we can heal. + * I'll leave it to the AFR2 maintainer to fix that code + * in the future as they may have had a good reason. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] && locked_on[i]) + healed_sinks[i] = 1; + } if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) || !sources_count) { diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index b28ce4170f1..9ca56f8bd9d 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -13,6 +13,288 @@ #include "afr-self-heal.h" #include "afr-messages.h" + + +/* + * Helper function to create the destination location for the copy + * of the directory entry we are moving out of the way. + */ +static int +_afr_sh_create_unsplit_loc (struct afr_reply *replies, const int child_idx, + loc_t *loc, loc_t *unsplit_loc) +{ + int ret = 0; + int new_path_len = 0; + int new_name_len = 0; + char *new_path = NULL; + char *new_name = NULL; + char *tmp_gfid_str; + const char *filename = NULL; + uuid_t rand_uuid; + + tmp_gfid_str = alloca (sizeof (UUID0_STR)); + + /* + * All of these allocations will be cleaned up + * @ afr_sh_gfid_unsplit_rename_done via loc_wipe. + */ + if (loc_copy (unsplit_loc, loc)) { + ret = EINVAL; + goto err; + } + + inode_unref (unsplit_loc->inode); + unsplit_loc->inode = inode_new (loc->inode->table); + unsplit_loc->parent = inode_ref (loc->parent); + gf_uuid_copy (unsplit_loc->inode->gfid, + replies[child_idx].poststat.ia_gfid); + unsplit_loc->inode->ia_type = loc->inode->ia_type; + + gf_uuid_generate (rand_uuid); + /* Note: Use re-entrant version of uuid_utoa! */ + tmp_gfid_str = uuid_utoa_r (rand_uuid, tmp_gfid_str); + + /* Copy the GFIDs, file + parent directory */ + gf_uuid_copy (unsplit_loc->gfid, rand_uuid); + gf_uuid_copy (unsplit_loc->pargfid, + replies[child_idx].postparent.ia_gfid); + + filename = loc->name; + + /* + * New path: Add 11 for null + ".unsplit_" + "_". We _could_ nuke + * tmp_gfid_str entirely here, iff we assume the uuid_utoa + * formatting to _never_change. 
If we assume this we can just add + * 32 to the length and call uuid_utoa directly in the snprintf. + */ + new_path_len = strlen (filename) + strlen (tmp_gfid_str) + 11; + new_path = GF_CALLOC (1, new_path_len, gf_common_mt_char); + if (!new_path) { + ret = ENOMEM; + goto err; + } + snprintf (new_path, new_path_len, ".unsplit_%s_%s", tmp_gfid_str, + filename); + unsplit_loc->path = new_path; + + /* New name: Add 11 for null + ".unsplit_" + "_" */ + new_name_len = strlen (loc->name) + strlen (tmp_gfid_str) + 11; + new_name = GF_CALLOC (1, new_name_len, gf_common_mt_char); + if (!new_name) { + ret = ENOMEM; + goto err; + } + snprintf (new_name, new_name_len, ".unsplit_%s_%s", tmp_gfid_str, + loc->name); + unsplit_loc->name = new_name; + + return 0; +err: + GF_FREE (new_path); + GF_FREE (new_name); + return ret; +} + +static int +_afr_gfid_unsplit_rename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + + local = frame->local; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "rename entry %s/%s failed, on child %d reason, %s", + uuid_utoa (local->loc.pargfid), + local->loc.name, child_index, strerror (op_errno)); + } + gf_log (this->name, GF_LOG_DEBUG, + "GFID unsplit successful on %s/%s, on child %d", + uuid_utoa (local->loc.pargfid), local->loc.name, child_index); + + syncbarrier_wake (&local->barrier); + return 0; +} +int +__afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on, + struct afr_reply *replies, inode_t *inode, + loc_t *loc) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + loc_t *unsplit_loc; + unsigned int i = 0; + unsigned int split_count = 0; + unsigned char *rename_list; + int ret = 0; + + frame 
= afr_frame_create (this); + + local = frame->local; // Local variables for our frame + priv = this->private; // xlator specific variables + rename_list = alloca0 (priv->child_count); + + if (loc_copy (&local->loc, loc)) { + ret = ENOMEM; + goto out; + } + + /* Pre-compute the number of rename calls we will be doing */ + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] && + !gf_uuid_is_null (replies[i].poststat.ia_gfid) && + gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) { + split_count++; + } + } + + gf_log (this->name, GF_LOG_INFO, "Found %d split-brained gfid's.", + split_count); + + local->unsplit_locs = GF_CALLOC (priv->child_count, + sizeof (*unsplit_loc), gf_afr_mt_loc_t); + if (!local->unsplit_locs) { + ret = ENOMEM; + goto out; + } + + afr_local_replies_wipe (local, priv); + local->call_count = 0; + for (i = 0; i < priv->child_count; i++) { + unsplit_loc = &local->unsplit_locs[i]; + if (locked_on[i] && local->child_up[i] && + replies[i].op_errno != ENOENT && + !gf_uuid_is_null (replies[i].poststat.ia_gfid) && + gf_uuid_compare (replies[i].poststat.ia_gfid, loc->gfid)) { + ret = _afr_sh_create_unsplit_loc (replies, i, + loc, unsplit_loc); + gf_log (this->name, GF_LOG_INFO, "Renaming child %d to " + " %s/%s to resolve gfid split-brain.", i, + uuid_utoa (unsplit_loc->pargfid), + unsplit_loc->name); + rename_list[i] = 1; + /* frame, rfn, cky, obj, fn, params */ + STACK_WIND_COOKIE (frame, + _afr_gfid_unsplit_rename_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + loc, unsplit_loc, NULL); + local->call_count++; + } + } + syncbarrier_wait (&local->barrier, local->call_count); + +out: + for (i = 0; i < priv->child_count; i++) { + if (rename_list[i]) + loc_wipe (&local->unsplit_locs[i]); + } + if (frame) + AFR_STACK_DESTROY (frame); + return ret; +} + +int +__afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid, + const char *bname, inode_t *inode, + struct afr_reply *replies, 
void *gfid, + unsigned char *locked_on) +{ + int ret = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + loc_t loc = {0, }; + call_frame_t *new_frame = NULL; + afr_local_t *new_local = NULL; + int fav_child = -1; + unsigned char *fav_gfid; + char *policy_str; + + priv = this->private; + + new_frame = afr_frame_create (this); + if (!new_frame) { + ret = -ENOMEM; + goto out; + } + + new_local = new_frame->local; + + gf_uuid_copy (parent->gfid, pargfid); + + loc.parent = inode_ref (parent); + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + + /* + * Ok, go find our favorite child by one of the active policies: + * majority -> ctime -> mtime -> size -> predefined + * we'll use this gfid as the "real" one. + */ + fav_child = afr_sh_get_fav_by_policy (this, replies, inode, + &policy_str); + if (fav_child == -1) { /* No policies are in place, bail */ + gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID " + "split brain, there are no favorite child policies " + "set."); + ret = -EIO; + goto out; + } + fav_gfid = replies[fav_child].poststat.ia_gfid; + gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid " + "split-brain. GFID is %s.", fav_child, uuid_utoa (fav_gfid)); + + gf_uuid_copy (loc.gfid, fav_gfid); + ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies, + inode, &loc); + + if (ret) + goto out; + + xdata = dict_new (); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_static_bin (xdata, "gfid-req", fav_gfid, 16); + if (ret) { + ret = -ENOMEM; + goto out; + } + + /* Clear out old replies here and wind lookup on all locked + * subvolumes to achieve two things: + * a. gfid heal on those subvolumes that do not have gfid associated + * with the inode, and + * b. refresh replies, which can be consumed by + * __afr_selfheal_name_impunge(). + */ + afr_replies_wipe (replies, priv->child_count); + /* This sends out lookups to all bricks and blocks once we have + * them. 
+ */ + AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup, + &loc, xdata); + afr_replies_copy (replies, new_local->replies, priv->child_count); +out: + loc_wipe (&loc); + if (xdata) + dict_unref (xdata); + if (new_frame) + AFR_STACK_DESTROY (new_frame); + + return ret; +} + int __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid, const char *bname, inode_t *inode, @@ -28,6 +310,7 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid, loc_t loc = {0, }; call_frame_t *new_frame = NULL; afr_local_t *new_local = NULL; + int i; priv = this->private; @@ -83,6 +366,25 @@ __afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid, * __afr_selfheal_name_impunge(). */ + gf_log (this->name, GF_LOG_INFO, + "smashing gfid to %s", uuid_utoa(gfid)); + + ia_type_t ia_type = replies[0].poststat.ia_type; + for (i = 1; i < priv->child_count; ++i) { + if (replies[i].poststat.ia_type != ia_type) { + if (replies[i].poststat.ia_type == IA_INVAL) { + continue; + } + gf_log (this->name, GF_LOG_WARNING, + "type[%d] = %d (not %d)", i, + replies[i].poststat.ia_type, ia_type); + if (ia_type != IA_INVAL) { + ret = -EIO; + goto out; + } + ia_type = replies[i].poststat.ia_type; + } + } AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); @@ -266,52 +568,6 @@ afr_selfheal_name_need_heal_check (xlator_t *this, struct afr_reply *replies) return need_heal; } -static int -afr_selfheal_name_type_mismatch_check (xlator_t *this, struct afr_reply *replies, - int source, unsigned char *sources, - uuid_t pargfid, const char *bname) -{ - int i = 0; - int type_idx = -1; - ia_type_t inode_type = IA_INVAL; - afr_private_t *priv = NULL; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - - if (replies[i].poststat.ia_type == IA_INVAL) - continue; - - if (inode_type == IA_INVAL) { - inode_type = replies[i].poststat.ia_type; - type_idx = i; - 
continue; - } - - if (sources[i] || source == -1) { - if ((sources[type_idx] || source == -1) && - (inode_type != replies[i].poststat.ia_type)) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SPLIT_BRAIN, - "Type mismatch for <gfid:%s>/%s: " - "%d on %s and %d on %s", - uuid_utoa(pargfid), bname, - replies[i].poststat.ia_type, - priv->children[i]->name, - replies[type_idx].poststat.ia_type, - priv->children[type_idx]->name); - - return -EIO; - } - inode_type = replies[i].poststat.ia_type; - type_idx = i; - } - } - return 0; -} static int afr_selfheal_name_gfid_mismatch_check (xlator_t *this, struct afr_reply *replies, @@ -408,7 +664,10 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, gf_boolean_t source_is_empty = _gf_true; gf_boolean_t need_heal = _gf_false; gf_boolean_t is_gfid_absent = _gf_false; + gf_boolean_t tried_gfid_unsplit = _gf_false; + afr_private_t *priv = NULL; + priv = this->private; need_heal = afr_selfheal_name_need_heal_check (this, replies); if (!need_heal) return 0; @@ -424,18 +683,16 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, return ret; } - ret = afr_selfheal_name_type_mismatch_check (this, replies, source, - sources, pargfid, bname); - if (ret) - return ret; - +gfid_mismatch_check: ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source, sources, &gfid_idx, pargfid, bname); - if (ret) + + if (ret && tried_gfid_unsplit) { return ret; + } - if (gfid_idx == -1) { + if (gfid_idx == -1) { if (!gfid_req || gf_uuid_is_null (gfid_req)) return -1; gfid = gfid_req; @@ -443,12 +700,24 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, gfid = &replies[gfid_idx].poststat.ia_gfid; } + if (priv->gfid_splitbrain_forced_heal || ret) { + ret = __afr_selfheal_gfid_unsplit (this, parent, pargfid, + bname, inode, replies, gfid, locked_on); + + if (ret) + return ret; + + tried_gfid_unsplit = _gf_true; + goto gfid_mismatch_check; + } + is_gfid_absent 
= (gfid_idx == -1) ? _gf_true : _gf_false; - ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode, - replies, gfid, locked_on, - is_gfid_absent); - if (ret) + ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, + inode, replies, gfid, + locked_on, is_gfid_absent); + if (ret) { return ret; + } if (gfid_idx == -1) { gfid_idx = afr_selfheal_gfid_idx_get (this, replies, sources); diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c6ac5ebfd1b..4ac1d32f58a 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -58,6 +58,7 @@ typedef struct { eh_t **statistics; uint32_t max_threads; uint32_t wait_qlength; + uint32_t halo_max_latency_msec; } afr_self_heald_t; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index af81b77ddb6..86f667116af 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -143,6 +143,10 @@ reconfigure (xlator_t *this, dict_t *options) priv->metadata_splitbrain_forced_heal, options, bool, out); + GF_OPTION_RECONF ("gfid-splitbrain-forced-heal", + priv->gfid_splitbrain_forced_heal, options, bool, + out); + GF_OPTION_RECONF ("background-self-heal-count", priv->background_self_heal_count, options, uint32, out); @@ -160,6 +164,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, bool, out); + GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal, + options, bool, out); + GF_OPTION_RECONF ("data-self-heal-window-size", priv->data_self_heal_window_size, options, uint32, out); @@ -176,6 +183,42 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); + GF_OPTION_RECONF ("halo-enabled", + priv->halo_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-failover-enabled", + priv->halo_failover_enabled, options, bool, + 
out); + + GF_OPTION_RECONF ("halo-shd-max-latency", + priv->shd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec, + options, uint32, out); + + GF_OPTION_RECONF ("halo-hybrid-mode", + priv->halo_hybrid_mode, options, bool, + out); + + GF_OPTION_RECONF ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, options, + uint32, out); + + GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options, + uint32, out); + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -256,6 +299,7 @@ reconfigure (xlator_t *this, dict_t *options) if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1) goto out; + priv->did_local_discovery = _gf_false; priv->did_discovery = _gf_false; ret = 0; @@ -327,6 +371,9 @@ init (xlator_t *this) GF_OPTION_INIT ("metadata-splitbrain-forced-heal", priv->metadata_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("gfid-splitbrain-forced-heal", + priv->gfid_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); if (read_subvol) { priv->read_child = xlator_subvolume_index (this, read_subvol); @@ -377,6 +424,8 @@ init (xlator_t *this) GF_OPTION_INIT ("shd-wait-qlength", priv->shd.wait_qlength, uint32, out); + GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out); + GF_OPTION_INIT ("background-self-heal-count", priv->background_self_heal_count, uint32, out); @@ -396,6 +445,35 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + GF_OPTION_INIT ("halo-hybrid-mode", + 
priv->halo_hybrid_mode, bool, out); + + GF_OPTION_INIT ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, uint32, + out); + + GF_OPTION_INIT ("halo-enabled", + priv->halo_enabled, bool, out); + + GF_OPTION_INIT ("halo-failover-enabled", + priv->halo_failover_enabled, bool, out); + + GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32, + out); + + GF_OPTION_INIT ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, uint32, out); + + GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -445,17 +523,24 @@ init (xlator_t *this) priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up) { + + priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency), + child_count, + gf_afr_mt_child_latency_t); + + if (!priv->child_up || !priv->child_latency) { ret = -ENOMEM; goto out; } - for (i = 0; i < child_count; i++) + for (i = 0; i < child_count; i++) { + priv->child_latency[i] = 0.0; priv->child_up[i] = -1; /* start with unknown state. this initialization needed for afr_notify() to work reliably */ + } priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, gf_afr_mt_xlator_t); @@ -663,6 +748,85 @@ struct volume_options options[] = { "jobs that can perform parallel heals in the " "background." }, + { .key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "Maximum latency for shd halo replication in msec." 
+ }, + { .key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable Halo (geo) replication mode." + }, + { .key = {"halo-failover-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable x-halo failover: will allow failover " + "to bricks outside the client or daemons' halo " + "in an attempt to satisfy halo-min-replicas." + }, + { .key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for nfsd halo replication in msec." + }, + { .key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for halo replication in msec." + }, + { .key = {"halo-hybrid-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable hybrid sync mounts. When enabled, halo will " + "do write FOPs synchronously, and read FOPs will be " + "services in-region if the inode is clean/consistent." + "If no bricks can be found below " + "halo-hybrid-max-read-latency then the best 2 shall " + "be selected. This option can be used in " + "conjunction with all other halo options." + }, + { .key = {"halo-hybrid-read-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "8", + .description = "Maximum latency hybrid mode will use to select " + "children for read FOPs. Don't tune this unless " + "you really know what you are doing (i.e. you've " + "read/understand the associated source code)." + }, + { .key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD." 
+ }, + { .key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .description = "The minimum number of halo replicas, before adding " + "out of region replicas." + }, + { .key = {"halo-min-samples"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "3", + .description = "The minimum number of halo latency samples, before " + "we start forming the halos." + }, { .key = {"heal-wait-queue-length"}, .type = GF_OPTION_TYPE_INT, .min = 0, @@ -803,6 +967,13 @@ struct volume_options options[] = { "translator is running as part of self-heal-daemon " "or not." }, + { .key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not." + }, { .key = {"quorum-type"}, .type = GF_OPTION_TYPE_STR, .value = { "none", "auto", "fixed"}, @@ -865,9 +1036,13 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "off", }, + { .key = {"gfid-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {"heal-timeout"}, .type = GF_OPTION_TYPE_INT, - .min = 60, + .min = 5, .max = INT_MAX, .default_value = "600", .description = "time interval for checking the need to self-heal " @@ -933,5 +1108,9 @@ struct volume_options options[] = { " with identical mtime and size in more than half the " "number of bricks in the replica.", }, + { .key = {"pgfid-self-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 70c3e349743..b61f6f67460 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -30,6 +30,9 @@ #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) +#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* 
Latency for down children */ +#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Examine bricks <= 10 msec */ +#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */ #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ @@ -48,6 +51,8 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) #define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) #define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) +#define AFR_ROOT_GFID "00000000-0000-0000-0000-000000000001" +#define AFR_IS_ROOT_GFID(g) (strcmp (uuid_utoa(g), AFR_ROOT_GFID) == 0) #define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;}) #define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX)) @@ -72,6 +77,17 @@ typedef enum { AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; +struct afr_nfsd { + gf_boolean_t iamnfsd; + uint32_t halo_max_latency_msec; +}; + +struct afr_child { + uint32_t idx; + int64_t latency; + unsigned char child_up; +}; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -83,6 +99,8 @@ typedef struct _afr_private { inode_t *root_inode; unsigned char *child_up; + int64_t *child_latency; + gf_boolean_t pgfid_self_heal; unsigned char *local; char **pending_key; @@ -111,6 +129,7 @@ typedef struct _afr_private { gf_boolean_t entry_change_log; /* on/off */ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + gf_boolean_t gfid_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not 
set */ int favorite_child; /* subvolume to be preferred in resolving @@ -148,13 +167,25 @@ typedef struct _afr_private { uint32_t event_generation; gf_boolean_t choose_local; + gf_boolean_t did_local_discovery; gf_boolean_t did_discovery; uint64_t sh_readdir_size; gf_boolean_t ensure_durability; char *sh_domain; char *afr_dirty; + gf_boolean_t halo_enabled; + + /* Halo geo-replication tunables */ + gf_boolean_t halo_failover_enabled; + gf_boolean_t halo_hybrid_mode; + uint32_t halo_hybrid_read_max_latency_msec; + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; + uint32_t halo_min_samples; - afr_self_heald_t shd; + afr_self_heald_t shd; + struct afr_nfsd nfsd; gf_boolean_t consistent_metadata; uint64_t spb_choice_timeout; @@ -787,6 +818,7 @@ typedef struct _afr_local { mode_t umask; int xflag; gf_boolean_t do_discovery; + gf_boolean_t do_local_discovery; struct afr_reply *replies; /* For client side background heals. */ @@ -795,6 +827,9 @@ typedef struct _afr_local { gf_boolean_t need_full_crawl; gf_boolean_t is_read_txn; + loc_t *unsplit_locs; /* Un-split targets */ + uuid_t heal_pgfid; /* pgfid of file being healed */ + char *heal_ancestry_path; /* Full path if avail */ } afr_local_t; diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/cluster/aha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am new file mode 100644 index 00000000000..006db127d28 --- /dev/null +++ b/xlators/cluster/aha/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = aha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +aha_la_LDFLAGS = -module -avoid-version + +aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c +aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = aha-mem-types.h 
aha.h aha-helpers.h aha.h aha-retry.h aha-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c new file mode 100644 index 00000000000..3b2ca641de2 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.c @@ -0,0 +1,952 @@ +#include "aha-fops.h" + +static void +__save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + list_add_tail (&fop->list, &conf->failed); +} + +void +save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __save_fop (fop, conf); + } + UNLOCK (&conf->lock); +} + +#define AHA_HANDLE_FOP(frame, type, cbk, obj, fn, args ...) \ + do { \ + struct aha_fop *fop = aha_fop_new (); \ + if (!fop) { \ + gf_log (GF_AHA, GF_LOG_CRITICAL, \ + "Allocation failed, terminating " \ + "to prevent a hung mount."); \ + assert (0); \ + } \ + fop->stub = fop_##type##_stub (frame, aha_##type, \ + args); \ + fop->frame = frame; \ + frame->local = fop; \ + STACK_WIND (frame, cbk, obj, fn, args); \ + } while (0) \ + +/* + * AHA_HANDLE_FOP_CBK + * + * 1) If the error returned is ENOTCONN *and* the timer that waits + * for the server to come back has not expired, store the fop to retry later. + * 2) If the timer waiting for the server has expired, just unwind. + * 3) If the error returned is something other than ENOTCONN, just unwind. + * + */ +#define AHA_HANDLE_FOP_CBK(type, frame, args ...) 
\ + do { \ + struct aha_conf *conf = frame->this->private; \ + struct aha_fop *fop = frame->local; \ + if (op_ret != 0 && op_errno == ENOTCONN && \ + !aha_is_timer_expired (conf)) { \ + gf_log (GF_AHA, GF_LOG_WARNING, \ + "Got ENOTCONN from client, storing " \ + "to retry later!"); \ + save_fop (fop, conf); \ + } else { \ + AHA_DESTROY_LOCAL (frame); \ + STACK_UNWIND_STRICT (type, frame, args); \ + } \ + } while (0) \ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode, + buf, xdata, postparent); + return 0; +} + + +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + loc, xdata); + return 0; +} + + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, stat, aha_stat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, + loc, xdata); + return 0; +} + + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + + +int +aha_fsetattr_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} + + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t mask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, access, aha_access_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->access, + 
loc, mask, xdata); + return 0; +} + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno, + path, sbuf, xdata); + return 0; +} + + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + size_t size, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readlink, + loc, size, xdata); + return 0; +} + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; +} + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, 
op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->unlink, + loc, xflag, xdata); + return 0; +} + + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rmdir, + loc, flags, xdata); + return 0; +} + + +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->symlink, + linkpath, loc, umask, xdata); + return 0; +} + + +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf, + preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + return 0; +} + + +int +aha_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, 
rename, aha_rename_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, link, aha_link_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, create, aha_create_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + + +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, open, aha_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt 
*stbuf, struct iobref *iobref, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno, + vector, count, stbuf, iobref, xdata); + return 0; +} + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readv, aha_readv_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, + off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, writev, aha_writev_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + return 0; +} + + +int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, flush, aha_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, + fd, xdata); + return 0; +} + + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, flags, xdata); + return 0; 
+} + + +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, + fd, xdata); + return 0; +} + + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, opendir, aha_opendir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->opendir, + loc, fd, xdata); + return 0; +} + +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsyncdir, + fd, flags, xdata); + return 0; +} + + +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->statfs, + loc, xdata); + return 0; +} + + + +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + 
AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + + +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->getxattr, + loc, name, xdata); + return 0; +} + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + 
AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->xattrop, + loc, flags, dict, xdata); + return 0; +} + + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fxattrop, + fd, flags, dict, xdata); + return 0; +} + + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock 
*lock, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata); + return 0; +} + + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lk, aha_lk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lk, + fd, cmd, lock, xdata); + return 0; +} + + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->inodelk, + volume, loc, cmd, lock, xdata); + return 0; +} + + +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->finodelk, + volume, fd, cmd, lock, xdata); + return 0; +} + + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + return 0; +} + + +int +aha_fentrylk_cbk (call_frame_t 
*frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + return 0; +} + +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdir, + fd, size, off, xdata); + return 0; +} + + +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdirp, + fd, size, off, dict); + return 0; +} diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h new file mode 100644 index 00000000000..b1fb9d38a80 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.h @@ -0,0 +1,360 @@ +#ifndef _AHA_FOPS_H +#define _AHA_FOPS_H + +#include "aha.h" +#include "aha-helpers.h" + +/* FOP functions */ +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t 
*loc, dict_t *xdata); + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata); + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata); + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); + +int +aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata); + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int 
+aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata); + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata); + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, + dict_t *xdata); + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd 
cmd, entrylk_type type, dict_t *xdata); + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +/* Callback functions */ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent); + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata); + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + + 
+int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata); +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata); + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata); +int 
+aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata); +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata); +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_lk_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this,
+            int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+            dict_t *xdata);
+
+int
+aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                 dict_t *xdata);
+int
+aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                  dict_t *xdata);
+
+#endif /* _AHA_FOPS_H */
diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c
new file mode 100644
index 00000000000..e3b713688d3
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.c
@@ -0,0 +1,46 @@
+#include "aha-helpers.h"
+
+/*
+ * Allocate a struct aha_conf with GF_CALLOC, then initialise its list of
+ * failed fops and its lock.
+ *
+ * Returns the new conf, or NULL when the allocation fails.
+ */
+struct aha_conf *aha_conf_new (void)
+{
+        struct aha_conf *conf = NULL;
+
+        conf = GF_CALLOC (1, sizeof (*conf), gf_aha_mt_conf);
+        if (!conf)
+                goto err;
+
+        INIT_LIST_HEAD (&conf->failed);
+
+        LOCK_INIT (&conf->lock);
+err:
+        /* Shared exit: conf is still NULL if the allocation failed. */
+        return conf;
+}
+
+/*
+ * Destroy the lock of a conf and free it.
+ *
+ * A NULL conf is ignored, matching aha_fop_destroy (), so cleanup paths
+ * that never obtained a conf do not dereference NULL in LOCK_DESTROY.
+ */
+void aha_conf_destroy (struct aha_conf *conf)
+{
+        if (!conf)
+                return;
+
+        LOCK_DESTROY (&conf->lock);
+        GF_FREE (conf);
+}
+
+/*
+ * Allocate a struct aha_fop with GF_CALLOC and initialise its list node.
+ *
+ * Returns the new fop, or NULL when the allocation fails.
+ */
+struct aha_fop *aha_fop_new (void)
+{
+        struct aha_fop *fop = NULL;
+
+        fop = GF_CALLOC (1, sizeof (*fop), gf_aha_mt_fop);
+        if (!fop)
+                goto err;
+
+        INIT_LIST_HEAD (&fop->list);
+
+err:
+        /* Shared exit: fop is still NULL if the allocation failed. */
+        return fop;
+}
+
+/*
+ * Destroy the call stub held by a fop and free the fop itself.
+ * A NULL fop is ignored.
+ */
+void aha_fop_destroy (struct aha_fop *fop)
+{
+        if (!fop)
+                return;
+
+        call_stub_destroy (fop->stub);
+        fop->stub = NULL;
+        GF_FREE (fop);
+}
diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h
new file mode 100644
index
00000000000..d9cf9b3295d
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-helpers.h
@@ -0,0 +1,23 @@
+#ifndef _AHA_HELPERS_H
+#define _AHA_HELPERS_H
+
+#include "aha.h"
+
+/* Name passed as the first argument to gf_log () by the aha sources. */
+#define GF_AHA "aha"
+
+/* Allocate and initialise a conf; returns NULL on allocation failure. */
+struct aha_conf *aha_conf_new (void);
+
+/* Destroy the conf's lock and free the conf. */
+void aha_conf_destroy (struct aha_conf *conf);
+
+/* Allocate and initialise a fop; returns NULL on allocation failure. */
+struct aha_fop *aha_fop_new (void);
+
+/* Destroy the fop's call stub and free the fop; NULL is ignored. */
+void aha_fop_destroy (struct aha_fop *fop);
+
+/*
+ * Destroy the struct aha_fop stored in frame->local and clear the pointer.
+ *
+ * The argument is evaluated twice, so pass a plain variable. The temporary
+ * carries an aha_-prefixed name so that an argument expression mentioning a
+ * variable called "fop" is not captured by the macro's own declaration.
+ */
+#define AHA_DESTROY_LOCAL(frame)                                        \
+        do {                                                            \
+                struct aha_fop *aha_local_fop_ = (frame)->local;        \
+                aha_fop_destroy (aha_local_fop_);                       \
+                (frame)->local = NULL;                                  \
+        } while (0)
+
+#endif /* _AHA_HELPERS_H */
diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h
new file mode 100644
index 00000000000..117dda27e8b
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-mem-types.h
@@ -0,0 +1,22 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __AHA_MEM_TYPES_H__
+#define __AHA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_aha_mem_types_ {
+        gf_aha_mt_begin_t = gf_common_mt_end + 1,
+        gf_aha_mt_conf,
+        gf_aha_mt_fop,
+        gf_aha_mt_end
+};
+#endif
diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c
new file mode 100644
index 00000000000..8810f913f42
--- /dev/null
+++ b/xlators/cluster/aha/src/aha-retry.c
@@ -0,0 +1,524 @@
+#include "aha.h"
+#include "aha-helpers.h"
+#include "aha-retry.h"
+#include "aha-fops.h"
+
+/*
+ * AHA_RETRY_FOP:
+ *
+ * - We STACK_WIND the fop using the arguments in the call_stub.
+ *   We use STACK_WIND because we need a *new* frame, since we already
+ *   exhausted the existing frame with the original STACK_WIND.
+ *
+ * - After STACK_WIND completes, we can destroy this frame's local (which
+ *   should be struct aha_fop *). The frame itself will get destroyed higher in
+ *   the xlator graph, since it's still part of the call stack.
+ *
+ * The macro declares its own "stub", "frame" and "this" locals before the
+ * variadic arguments are expanded, so argument expressions written in terms
+ * of stub->args resolve to the macro's copy of fop->stub.
+ */
+#define AHA_RETRY_FOP(fop, type, args ...)                              \
+        do {                                                            \
+                call_stub_t *stub = (fop)->stub;                        \
+                call_frame_t *frame = (fop)->frame;                     \
+                xlator_t *this = frame->this;                           \
+                STACK_WIND (frame, aha_##type##_cbk, this,              \
+                            this->fops->type, args);                    \
+                AHA_DESTROY_LOCAL (frame);                              \
+        } while (0)
+
+/*
+ * AHA_UNWIND_FOP:
+ *
+ * Destroy the frame's local, then call default_<type>_failure_cbk on the
+ * frame with ETIMEDOUT.
+ */
+#define AHA_UNWIND_FOP(fop, type)                                       \
+        do {                                                            \
+                call_frame_t *frame = (fop)->frame;                     \
+                AHA_DESTROY_LOCAL (frame);                              \
+                default_##type##_failure_cbk (frame, ETIMEDOUT);        \
+        } while (0)
+
+/*
+ * Empty conf->failed, handing every queued fop to aha_force_unwind_fop ().
+ *
+ * Takes no lock itself; aha_force_unwind_fops () calls it with conf->lock
+ * held.
+ * NOTE(review): aha_force_unwind_fop () therefore runs under conf->lock --
+ * confirm it cannot reach a path that takes the same lock again.
+ */
+void
+__aha_retry_force_unwind_fops (struct aha_conf *conf)
+{
+        struct aha_fop *fop = NULL;
+        struct aha_fop *tmp = NULL;
+        size_t ndrained = 0;
+
+        /*
+         * Drain the queue. After we finish the loop, the list
+         * must be empty.
+         */
+        list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+                list_del (&fop->list);
+                aha_force_unwind_fop (fop);
+                ndrained++;
+        }
+
+        gf_log (GF_AHA, GF_LOG_WARNING,
+                "Force-unwound %"GF_PRI_SIZET" fops!", ndrained);
+
+        assert (list_empty (&conf->failed));
+}
+
+/* Locked entry point: force-unwind every queued fop under conf->lock. */
+void
+aha_force_unwind_fops (struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        {
+                __aha_retry_force_unwind_fops (conf);
+        }
+        UNLOCK (&conf->lock);
+}
+
+/*
+ * Re-issue every fop queued on conf->failed through aha_retry_fop ().
+ *
+ * Returns early, leaving the queue untouched, while the child is down or
+ * when nothing is queued. Takes no lock itself; aha_retry_failed_fops ()
+ * calls it with conf->lock held.
+ */
+void
+__aha_retry_failed_fops (struct aha_conf *conf)
+{
+        struct aha_fop *fop = NULL;
+        struct aha_fop *tmp = NULL;
+        size_t ndrained = 0;
+
+        /*
+         * Skip if the child is not up
+         */
+        if (!conf->child_up) {
+                gf_log (GF_AHA, GF_LOG_WARNING,
+                        "Waiting for child to come up before retrying.");
+                return;
+        }
+
+        /*
+         * Skip if the queue is empty: without this return the code fell
+         * through and also logged that zero fops were drained.
+         */
+        if (list_empty (&conf->failed)) {
+                gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry.");
+                return;
+        }
+
+        /*
+         * Drain the queue. After we finish the loop, the list
+         * must be empty.
+ */ + list_for_each_entry_safe (fop, tmp, &conf->failed, list) { + list_del (&fop->list); + aha_retry_fop (fop); + ndrained++; + } + + gf_log (GF_AHA, GF_LOG_WARNING, + "Drained %"GF_PRI_SIZET" fops!", ndrained); + + assert (list_empty (&conf->failed)); +} + + +void +aha_retry_failed_fops (struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __aha_retry_failed_fops (conf); + } + UNLOCK (&conf->lock); +} + +void aha_retry_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_CREATE: + AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags, + stub->args.mode, stub->args.umask, + stub->args.fd, + stub->args.xdata); + break; + + case GF_FOP_STAT: + AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READLINK: + AHA_RETRY_FOP (fop, readlink, &stub->args.loc, + stub->args.size, stub->args.xdata); + break; + + case GF_FOP_MKNOD: + AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode, + stub->args.rdev, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_MKDIR: + AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode, + stub->args.umask, stub->args.xdata); + break; + + case GF_FOP_UNLINK: + AHA_RETRY_FOP (fop, unlink, &stub->args.loc, stub->args.xflag, + stub->args.xdata); + break; + + case GF_FOP_RMDIR: + AHA_RETRY_FOP (fop, rmdir, &stub->args.loc, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_SYMLINK: + AHA_RETRY_FOP (fop, symlink, stub->args.linkname, + &stub->args.loc, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_RENAME: + AHA_RETRY_FOP (fop, rename, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_LINK: + AHA_RETRY_FOP (fop, link, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_TRUNCATE: + AHA_RETRY_FOP (fop, truncate, 
&stub->args.loc, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READ: + AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_WRITE: + AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector, + stub->args.count, stub->args.offset, + stub->args.flags, stub->args.iobref, + stub->args.xdata); + break; + + case GF_FOP_STATFS: + AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_FLUSH: + AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNC: + AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync, + stub->args.xdata); + break; + + case GF_FOP_SETXATTR: + AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_GETXATTR: + AHA_RETRY_FOP (fop, getxattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FSETXATTR: + AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd, + stub->args.xattr, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_FGETXATTR: + AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_REMOVEXATTR: + AHA_RETRY_FOP (fop, removexattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_OPENDIR: + AHA_RETRY_FOP (fop, opendir, &stub->args.loc, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNCDIR: + AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd, + stub->args.datasync, stub->args.xdata); + break; + + case GF_FOP_ACCESS: + AHA_RETRY_FOP (fop, access, &stub->args.loc, + stub->args.mask, stub->args.xdata); + break; + + case GF_FOP_FTRUNCATE: + AHA_RETRY_FOP (fop, ftruncate, stub->args.fd, + stub->args.offset, stub->args.xdata); + break; + + 
case GF_FOP_FSTAT: + AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_LK: + AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_INODELK: + AHA_RETRY_FOP (fop, inodelk, stub->args.volume, + &stub->args.loc, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_FINODELK: + AHA_RETRY_FOP (fop, finodelk, stub->args.volume, + stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_ENTRYLK: + AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_FENTRYLK: + AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_LOOKUP: + AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READDIR: + AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READDIRP: + AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_XATTROP: + AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_FXATTROP: + AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_SETATTR: + AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + case GF_FOP_FSETATTR: + AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + default: + /* Some fops are not implemented yet: + * + * GF_FOP_NULL + * GF_FOP_RCHECKSUM + * GF_FOP_FORGET + * GF_FOP_RELEASE + * GF_FOP_RELEASEDIR + * 
GF_FOP_GETSPEC + * GF_FOP_FALLOCATE + * GF_FOP_DISCARD + * GF_FOP_ZEROFILL + * GF_FOP_MAXVALUE + * + */ + gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s", + gf_fop_list[stub->fop]); + assert (0); + break; + } +} + +void +aha_force_unwind_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_UNWIND_FOP (fop, open); + break; + + case GF_FOP_CREATE: + AHA_UNWIND_FOP (fop, create); + break; + + case GF_FOP_STAT: + AHA_UNWIND_FOP (fop, stat); + break; + + case GF_FOP_READLINK: + AHA_UNWIND_FOP (fop, readlink); + break; + + case GF_FOP_MKNOD: + AHA_UNWIND_FOP (fop, mknod); + break; + + case GF_FOP_MKDIR: + AHA_UNWIND_FOP (fop, mkdir); + break; + + case GF_FOP_UNLINK: + AHA_UNWIND_FOP (fop, unlink); + break; + + case GF_FOP_RMDIR: + AHA_UNWIND_FOP (fop, rmdir); + break; + + case GF_FOP_SYMLINK: + AHA_UNWIND_FOP (fop, symlink); + break; + + case GF_FOP_RENAME: + AHA_UNWIND_FOP (fop, rename); + break; + + case GF_FOP_LINK: + AHA_UNWIND_FOP (fop, link); + break; + + case GF_FOP_TRUNCATE: + AHA_UNWIND_FOP (fop, truncate); + break; + + case GF_FOP_READ: + AHA_UNWIND_FOP (fop, readv); + break; + + case GF_FOP_WRITE: + AHA_UNWIND_FOP (fop, writev); + break; + + case GF_FOP_STATFS: + AHA_UNWIND_FOP (fop, statfs); + break; + + case GF_FOP_FLUSH: + AHA_UNWIND_FOP (fop, flush); + break; + + case GF_FOP_FSYNC: + AHA_UNWIND_FOP (fop, fsync); + break; + + case GF_FOP_SETXATTR: + AHA_UNWIND_FOP (fop, setxattr); + break; + + case GF_FOP_GETXATTR: + AHA_UNWIND_FOP (fop, getxattr); + break; + + case GF_FOP_FSETXATTR: + AHA_UNWIND_FOP (fop, fsetxattr); + break; + + case GF_FOP_FGETXATTR: + AHA_UNWIND_FOP (fop, fgetxattr); + break; + + case GF_FOP_REMOVEXATTR: + AHA_UNWIND_FOP (fop, removexattr); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_UNWIND_FOP (fop, fremovexattr); + break; + + case GF_FOP_OPENDIR: + AHA_UNWIND_FOP (fop, opendir); + break; + + case GF_FOP_FSYNCDIR: + AHA_UNWIND_FOP (fop, fsyncdir); + break; + + case 
GF_FOP_ACCESS: + AHA_UNWIND_FOP (fop, access); + break; + + case GF_FOP_FTRUNCATE: + AHA_UNWIND_FOP (fop, ftruncate); + break; + + case GF_FOP_FSTAT: + AHA_UNWIND_FOP (fop, fstat); + break; + + case GF_FOP_LK: + AHA_UNWIND_FOP (fop, lk); + break; + + case GF_FOP_INODELK: + AHA_UNWIND_FOP (fop, inodelk); + break; + + case GF_FOP_FINODELK: + AHA_UNWIND_FOP (fop, finodelk); + break; + + case GF_FOP_ENTRYLK: + AHA_UNWIND_FOP (fop, entrylk); + break; + + case GF_FOP_FENTRYLK: + AHA_UNWIND_FOP (fop, fentrylk); + break; + + case GF_FOP_LOOKUP: + AHA_UNWIND_FOP (fop, lookup); + break; + + case GF_FOP_READDIR: + AHA_UNWIND_FOP (fop, readdir); + break; + + case GF_FOP_READDIRP: + AHA_UNWIND_FOP (fop, readdirp); + break; + + case GF_FOP_XATTROP: + AHA_UNWIND_FOP (fop, xattrop); + break; + + case GF_FOP_FXATTROP: + AHA_UNWIND_FOP (fop, fxattrop); + break; + + case GF_FOP_SETATTR: + AHA_UNWIND_FOP (fop, setattr); + break; + + case GF_FOP_FSETATTR: + AHA_UNWIND_FOP (fop, fsetattr); + break; + + default: + /* Some fops are not implemented yet, + * and this would never happen cause we wouldn't + * queue them (see the assert statement in aha_retry_fop()) + */ + break; + } +} diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h new file mode 100644 index 00000000000..5c8f56bca97 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.h @@ -0,0 +1,12 @@ +#ifndef _AHA_RETRY_H +#define _AHA_RETRY_H + +void aha_retry_failed_fops (struct aha_conf *conf); + +void aha_retry_fop (struct aha_fop *fop); + +void aha_force_unwind_fops (struct aha_conf *conf); + +void aha_force_unwind_fop (struct aha_fop *fop); + +#endif /* _AHA_RETRY_H */ diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c new file mode 100644 index 00000000000..5160f1091d4 --- /dev/null +++ b/xlators/cluster/aha/src/aha.c @@ -0,0 +1,345 @@ +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" +#include "aha.h" + +#include "syncop.h" + + +int 
+retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg) +{ + /* Nothing to do here ... */ + return 0; +} + +int +retry_failed_fops (void *arg) +{ + xlator_t *this = NULL; + + struct aha_conf *conf = NULL; + + this = arg; + conf = this->private; + + aha_retry_failed_fops (conf); + + return 0; +} + +void +dispatch_fop_queue_drain (xlator_t *this) +{ + struct syncenv *env = NULL; + int ret = 0; + + env = this->ctx->env; + + ret = synctask_new (env, retry_failed_fops, + retry_failed_fops_cbk, NULL, this); + if (ret != 0) { + gf_log (GF_AHA, GF_LOG_CRITICAL, + "Failed to dispatch synctask " + "to drain fop queue!"); + } +} + +extern inline void +__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired) +{ + conf->timer_expired = expired; +} + +extern inline gf_boolean_t +__aha_is_timer_expired (struct aha_conf *conf) +{ + return conf->timer_expired; +} + +gf_boolean_t +aha_is_timer_expired (struct aha_conf *conf) +{ + gf_boolean_t expired = _gf_false; + + LOCK (&conf->lock); + { + expired = __aha_is_timer_expired (conf); + } + UNLOCK (&conf->lock); + + return expired; +} + +void +aha_child_down_timer_expired (void *data) +{ + struct aha_conf *conf = NULL; + + conf = data; + + gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!"); + + LOCK (&conf->lock); + { + __aha_set_timer_status (conf, _gf_true); + } + UNLOCK (&conf->lock); + + aha_force_unwind_fops ((struct aha_conf *)data); +} + +void +__aha_start_timer (struct aha_conf *conf) +{ + struct timespec child_down_timeout = { + .tv_sec = conf->server_wait_timeout, + .tv_nsec = 0 + }; + + __aha_set_timer_status (conf, _gf_false); + + conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout, + aha_child_down_timer_expired, conf); + if (!conf->timer) { + gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!"); + } + + gf_log (GF_AHA, GF_LOG_INFO, + "Registered timer for %lu seconds.", + conf->server_wait_timeout); +} + +void +__aha_cancel_timer (struct aha_conf *conf) +{ + if (!conf->timer) 
+ goto out; + + gf_timer_call_cancel (conf->this->ctx, conf->timer); + conf->timer = NULL; + gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!"); +out: + return; +} + +void +__aha_update_child_status (struct aha_conf *conf, int status) +{ + conf->child_up = status; +} + +void +aha_handle_child_up (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status ( + conf, AHA_CHILD_STATUS_UP); /* Mark the child as up */ + __aha_set_timer_status ( + conf, _gf_false); /* Timer is no longer expired */ + __aha_cancel_timer (conf); /* Cancel the timer */ + } + UNLOCK (&conf->lock); +} + +void +aha_handle_child_down (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status (conf, AHA_CHILD_STATUS_DOWN); + __aha_set_timer_status (conf, _gf_true); + __aha_start_timer (conf); + } + UNLOCK (&conf->lock); +} + +int32_t +notify (xlator_t *this, int32_t event, void *data, ...) +{ + switch (event) { + case GF_EVENT_CHILD_DOWN: + gf_log (this->name, GF_LOG_WARNING, "Got child-down event!"); + aha_handle_child_down (this); + break; + case GF_EVENT_CHILD_UP: + gf_log (this->name, GF_LOG_WARNING, "Got child-up event!"); + aha_handle_child_up (this); + dispatch_fop_queue_drain (this); + break; + default: + break; + } + + default_notify (this, event, data); + + return 0; +} + +int32_t +aha_priv_dump (xlator_t *this) +{ + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_aha_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init failed!"); + return ret; + } + + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF ("server-wait-timeout-seconds", + conf->server_wait_timeout, + options, size_uint64, err); + + return 0; +err: + return -1; +} + +int 
+aha_init_options (xlator_t *this) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_INIT ("server-wait-timeout-seconds", + conf->server_wait_timeout, + size_uint64, err); + + return 0; +err: + return -1; +} + + +int +init (xlator_t *this) +{ + int ret = 0; + struct aha_conf *conf = NULL; + + conf = aha_conf_new (); + if (!conf) { + ret = -(ENOMEM); + goto err; + } + + conf->this = this; + this->private = conf; + + aha_init_options (this); + + /* init() completed successfully */ + goto done; +err: + gf_log (GF_AHA, GF_LOG_ERROR, + "init() failed, please see " + "logs for details."); + + /* Free all allocated memory */ + aha_conf_destroy (conf); +done: + return ret; +} + +void +fini (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + aha_conf_destroy (conf); + + this->private = NULL; +} + +struct xlator_dumpops dumpops = { + .priv = aha_priv_dump, +}; + +struct xlator_fops cbks; + +struct xlator_fops fops = { + .lookup = aha_lookup, + .stat = aha_stat, + .readlink = aha_readlink, + .mknod = aha_mknod, + .mkdir = aha_mkdir, + .unlink = aha_unlink, + .rmdir = aha_rmdir, + .symlink = aha_symlink, + .rename = aha_rename, + .link = aha_link, + .truncate = aha_truncate, + .create = aha_create, + .open = aha_open, + .readv = aha_readv, + .writev = aha_writev, + .statfs = aha_statfs, + .flush = aha_flush, + .fsync = aha_fsync, + .setxattr = aha_setxattr, + .getxattr = aha_getxattr, + .removexattr = aha_removexattr, + .fsetxattr = aha_fsetxattr, + .fgetxattr = aha_fgetxattr, + .fremovexattr = aha_fremovexattr, + .opendir = aha_opendir, + .readdir = aha_readdir, + .readdirp = aha_readdirp, + .fsyncdir = aha_fsyncdir, + .access = aha_access, + .ftruncate = aha_ftruncate, + .fstat = aha_fstat, + .lk = aha_lk, + .lookup_cbk = aha_lookup_cbk, + .xattrop = aha_xattrop, + .fxattrop = aha_fxattrop, + .inodelk = aha_inodelk, + .finodelk = aha_finodelk, + .entrylk = aha_entrylk, + .fentrylk = aha_fentrylk, + .setattr = aha_setattr, + .fsetattr = 
aha_fsetattr, +}; + +struct volume_options options[] = { + { .key = {"server-wait-timeout-seconds"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 10, + .max = 20 * 60, + .default_value = TOSTRING (120), + .description = "Specifies the number of seconds the " + "AHA translator will wait " + "for a CHILD_UP event before " + "force-unwinding the frames it has " + "currently stored for retry." + }, + { .key = {NULL} } +}; diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h new file mode 100644 index 00000000000..3dbf3199776 --- /dev/null +++ b/xlators/cluster/aha/src/aha.h @@ -0,0 +1,46 @@ +#ifndef _AHA_H +#define _AHA_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "statedump.h" +#include "call-stub.h" +#include "defaults.h" +#include "list.h" +#include "timer.h" + +#include "aha-mem-types.h" + +/* new() and destroy() functions for all structs can be found in + * aha-helpers.c + */ +struct aha_conf { + xlator_t *this; + uint8_t child_up; + gf_lock_t lock; + struct list_head failed; + gf_timer_t *timer; + gf_boolean_t timer_expired; + uint64_t server_wait_timeout; +}; + +struct aha_fop { + call_stub_t *stub; /* Only used to store function arguments */ + call_frame_t *frame; /* Frame corresponding to this fop */ + uint64_t tries; + struct list_head list; +}; + +enum { + AHA_CHILD_STATUS_DOWN = 0, + AHA_CHILD_STATUS_UP = 1, + AHA_CHILD_STATUS_MAX +}; + +gf_boolean_t aha_is_timer_expired (struct aha_conf *conf); + +#endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index c4586c2f9b1..cd35080e243 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3463,6 +3463,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, int cnt = 0; char *node_uuid_key = NULL; int ret = -1; + + GF_CHECK_XATTR_KEY_AND_GOTO (key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err); VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); 
VALIDATE_OR_GOTO (loc, err); @@ -5553,6 +5555,7 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -5565,9 +5568,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, subvol, subvol->fops->mknod, loc, mode, rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + /* This will return NULL if all subvolumes are full + * and/or no subvolume needs the min_free_disk limit + */ + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->rdev = rdev; local->mode = mode; @@ -5597,6 +5606,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + return op_errno; } int32_t @@ -6165,7 +6176,7 @@ dht_mknod (call_frame_t *frame, xlator_t *this, gf_msg_debug (this->name, 0, "no subvolume in layout for path=%s", loc->path); - op_errno = EIO; + op_errno = NO_SUBVOL_HASH_ERRNO; goto err; } @@ -6236,8 +6247,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this, } } - dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode, - umask, params); + op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, + rdev, mode, umask, + params); + if (op_errno != 0) { + goto err; + } done: return 0; @@ -6571,7 +6586,7 @@ dht_link (call_frame_t *frame, xlator_t *this, gf_msg_debug (this->name, 0, "no subvolume in layout for path=%s", newloc->path); - op_errno = EIO; + op_errno = NO_SUBVOL_HASH_ERRNO; goto err; } @@ -6734,6 +6749,7 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -6748,8 +6764,10 @@ dht_create_wind_to_avail_subvol 
(call_frame_t *frame, xlator_t *this, } else { avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; local->mode = mode; @@ -6776,6 +6794,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return op_errno; } int @@ -6878,9 +6900,10 @@ dht_create_do (call_frame_t *frame) goto err; } - dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc, - local->flags, local->mode, - local->umask, local->fd, local->params); + dht_create_wind_to_avail_subvol (frame, this, subvol, + &local->loc, local->flags, + local->mode, local->umask, + local->fd, local->params); return 0; err: local->refresh_layout_unlock (frame, this, -1, 1); @@ -7067,7 +7090,7 @@ dht_create (call_frame_t *frame, xlator_t *this, "no subvolume in layout for path=%s", loc->path); - op_errno = EIO; + op_errno = NO_SUBVOL_HASH_ERRNO; goto err; } @@ -7590,7 +7613,7 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, gf_msg_debug (this->name, 0, "hashed subvol not found for %s", loc->path); - local->op_errno = EIO; + local->op_errno = NO_SUBVOL_HASH_ERRNO; goto err; } diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9e9ca712417..fa973f294fb 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -300,6 +300,7 @@ struct dht_du { uint64_t avail_space; uint32_t log; uint32_t chunks; + gf_boolean_t is_full; }; typedef struct dht_du dht_du_t; @@ -484,6 +485,7 @@ struct dht_conf { dht_du_t *du_stats; double min_free_disk; double min_free_inodes; + gf_boolean_t min_free_strict_mode; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -549,6 +551,10 @@ struct dht_conf { 
gf_boolean_t lock_migration_enabled; gf_lock_t lock; + + /* du stats */ + uint32_t du_refresh_interval_sec; + gf_lock_t du_refresh_lock; }; typedef struct dht_conf dht_conf_t; @@ -603,6 +609,8 @@ typedef struct dht_fd_ctx { } dht_fd_ctx_t; +#define NO_SUBVOL_HASH_ERRNO EROFS + #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) #define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 1eb9e63c531..1b20dabc61f 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; struct timeval tv = {0,}; + struct timeval cmp_tv = {0,}; loc_t tmp_loc = {0,}; conf = this->private; + /* Somebody else is already refreshing the statfs info */ + if (TRY_LOCK (&conf->du_refresh_lock) != 0) + return 0; + gettimeofday (&tv, NULL); + cmp_tv = conf->last_stat_fetch; + cmp_tv.tv_sec += conf->du_refresh_interval_sec; + /* make it root gfid, should be enough to get the proper info back */ tmp_loc.gfid[15] = 1; - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - + if (timercmp (&tv, &cmp_tv, >)) { statfs_frame = copy_frame (frame); if (!statfs_frame) { goto err; @@ -200,14 +206,18 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) &tmp_loc, statfs_local->params); } - conf->last_stat_fetch.tv_sec = tv.tv_sec; + conf->last_stat_fetch = tv; } - return 0; + ret = 0; + goto out; err: if (statfs_frame) DHT_STACK_DESTROY (statfs_frame); - return -1; + ret = -1; +out: + UNLOCK (&conf->du_refresh_lock); + return ret; } @@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) conf = this->private; /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { + 
if (TRY_LOCK (&conf->subvolume_lock) != 0) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + return conf->du_stats[i].is_full; + } + } + } else { for (i = 0; i < conf->subvolume_cnt; i++) { if (subvol == conf->subvolumes[i]) { if (conf->disk_unit == 'p') { @@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } } - } + + /* i will be less than subvolume_cnt if either of + * these booleans are true */ + is_subvol_filled = ( + subvol_filled_space || subvol_filled_inodes); + if (is_subvol_filled) { + conf->du_stats[i].is_full = is_subvol_filled; + } + } UNLOCK (&conf->subvolume_lock); if (subvol_filled_space && conf->subvolume_status[i]) { @@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } - is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); - return is_subvol_filled; } @@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol, layout); - if(!avail_subvol) - { - avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol, - layout); - } - } UNLOCK (&conf->subvolume_lock); out: @@ -325,7 +339,6 @@ out: gf_msg_debug (this->name, 0, "No subvolume has enough free space \ and/or inodes to create"); - avail_subvol = subvol; } if (layout) diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 298eca711b4..05f71fbcc86 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + if (conf->min_free_strict_mode == 
_gf_true) + dht_get_du_info (frame, this, loc); local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); if (!local) { @@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if (conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true && + flags & O_APPEND) { + op_errno = ENOSPC; + goto err; } if (xdata) local->xattr_req = dict_ref (xdata); diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 364b66c942e..48d49dd3475 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + loc_t *nil_loc = {0,}; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); if (!local) { @@ -173,12 +178,19 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } + if (conf->min_free_strict_mode == _gf_true) + dht_get_du_info (frame, this, nil_loc); + subvol = local->cached_subvol; if (!subvol) { gf_msg_debug (this->name, 0, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if (conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true) { + op_errno = ENOSPC; + goto err; } if (xdata) local->xattr_req = dict_ref (xdata); diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a478f06b2a9..dc0b7dd619e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -20,7 +20,7 @@ #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ -#define DHT_REBALANCE_BLKSIZE (128 * 1024) +#define 
DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */ #define MAX_MIGRATE_QUEUE_COUNT 500 #define MIN_MIGRATE_QUEUE_COUNT 200 @@ -1328,14 +1328,25 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, &dst_fd, xattr); - if (ret) - goto out; + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: " + "failed to create dest file on %s", + loc->path, to->name); + goto out; + } clean_dst = _gf_true; ret = __dht_check_free_space (to, from, loc, &stbuf, flag); if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: " + "Disk space check failed on %s", + loc->path, to->name); goto out; } @@ -1345,7 +1356,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, - "Migrate file failed: failed to open %s on %s", + "Migrate file failed: %s: failed to open on %s", loc->path, from->name); goto out; } @@ -1360,7 +1371,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (ret) { gf_msg (this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, - "Migrate file failed:failed to lookup %s on %s ", + "Migrate file failed: %s: failed to lookup %s ", loc->path, from->name); ret = -1; goto out; @@ -2427,6 +2438,9 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container, goto out; } + gf_uuid_copy (entry_loc.inode->gfid, + df_entry->d_stat.ia_gfid); + if (gf_uuid_is_null (df_entry->d_stat.ia_gfid)) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_NULL, diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 5c810f0dc77..ccbf66b626d 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -443,6 +443,8 @@ 
dht_reconfigure (xlator_t *this, dict_t *options) conf->disk_unit = 0; if (conf->min_free_disk < 100.0) conf->disk_unit = 'p'; + GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode, + options, bool, out); GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, percent, out); @@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options, bool, out); + + GF_OPTION_RECONF ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, options, uint32, out); ret = 0; out: return ret; @@ -720,7 +725,10 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + err); + + GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode, + bool, err); GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); @@ -738,6 +746,11 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled, bool, err); + GF_OPTION_INIT ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, uint32, err); + + LOCK_INIT (&conf->du_refresh_lock); + if (defrag) { defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -907,6 +920,14 @@ struct volume_options options[] = { "process starts balancing out the cluster, and logs will appear " "in log files", }, + { .key = {"min-free-strict-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "When enabled, will reject in-flight writes or " + "append operations to files when the target subvolume falls " + "below min-free-(disk|inodes). 
When disabled, these are allowed " + "through and only new files will be affected.", + }, { .key = {"min-free-inodes"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "5%", @@ -1089,5 +1110,14 @@ struct volume_options options[] = { " associated with a file during rebalance" }, + { .key = {"du-refresh-interval-sec"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "60", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies how many seconds before subvolume statfs " + "info is re-validated." + }, + { .key = {NULL} }, }; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 56e17d6e884..996faffa37f 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->params = dict_ref (params); local->mode = mode; @@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index f1e9a399442..8b14ac99b8f 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->mode = mode; local->flags = flags; @@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + 
goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index c21417a0192..0b5c095c3b4 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -35,6 +35,7 @@ #include "logging.h" #include "cli1-xdr.h" #include "statedump.h" +#include "syncop.h" #include <pwd.h> #include <grp.h> @@ -91,9 +92,13 @@ typedef struct _ios_sample_t { uid_t uid; gid_t gid; char identifier[UNIX_PATH_MAX]; + char path[UNIX_PATH_MAX]; glusterfs_fop_t fop_type; struct timeval timestamp; double elapsed; + gf_boolean_t have_path; + int32_t op_ret; + int32_t op_errno; } ios_sample_t; @@ -178,10 +183,33 @@ typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*, int , int , uint64_t ) ; struct ios_local { - struct timeval wind_at; - struct timeval unwind_at; + inode_t *inode; + loc_t loc; + fd_t *fd; }; +static struct ios_local * +ios_local_new() { + return GF_CALLOC (1, sizeof (struct ios_local), + gf_common_mt_char); +} + +static void +ios_local_free (struct ios_local *local) +{ + if (!local) + return; + + inode_unref (local->inode); + + if (local->fd) + fd_unref (local->fd); + + loc_wipe (&local->loc); + memset (local, 0, sizeof (*local)); + GF_FREE (local); +} + struct volume_options options[]; static int @@ -192,6 +220,57 @@ is_fop_latency_started (call_frame_t *frame) return memcmp (&frame->begin, &epoch, sizeof (epoch)); } +static void +ios_free_local (call_frame_t *frame) +{ + struct ios_local *local = frame->local; + + ios_local_free (local); + + frame->local = NULL; +} + +static void +ios_track_loc (call_frame_t *frame, loc_t *loc) +{ + struct ios_local *local = NULL; + + if (loc && loc->path) { + /* Check if frame->local is already set (it should + * only be set by either ios_track_loc() or + * ios_track_fd()). 
In other words, this check + * allows us to chain calls to ios_track_loc() + * and ios_track_fd() without clobbering frame->local + * in the process. + */ + if (frame->local) { + local = frame->local; + } else { + local = ios_local_new (); + } + loc_copy (&local->loc, loc); + frame->local = local; + } +} + +static void +ios_track_fd (call_frame_t *frame, fd_t *fd) +{ + struct ios_local *local = NULL; + + if (fd && fd->inode) { + if (frame->local) { + local = frame->local; + } else { + local = ios_local_new (); + } + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + frame->local = local; + } +} + + #define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples" #ifdef GF_LINUX_HOST_OS #define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats" @@ -206,7 +285,7 @@ is_fop_latency_started (call_frame_t *frame) conf = this->private; \ if (conf && conf->measure_latency) { \ gettimeofday (&frame->end, NULL); \ - update_ios_latency (conf, frame, GF_FOP_##op); \ + update_ios_latency (conf, frame, GF_FOP_##op, 0, 0); \ } \ } while (0) @@ -244,7 +323,7 @@ is_fop_latency_started (call_frame_t *frame) #define STATS_ADD(x,i) (x) += (i) #endif -#define UPDATE_PROFILE_STATS(frame, op) \ +#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno) \ do { \ struct ios_conf *conf = NULL; \ \ @@ -257,7 +336,8 @@ is_fop_latency_started (call_frame_t *frame) conf->count_fop_hits) { \ BUMP_FOP(op); \ gettimeofday (&frame->end, NULL); \ - update_ios_latency (conf, frame, GF_FOP_##op);\ + update_ios_latency (conf, frame, GF_FOP_##op, \ + op_ret, op_errno); \ } \ } \ STATS_UNLOCK (&conf->lock); \ @@ -647,7 +727,7 @@ ios_stats_cleanup (xlator_t *this, inode_t *inode) fprintf (logfp, fmt); \ fprintf (logfp, "\n"); \ } \ - gf_log (this->name, GF_LOG_DEBUG, fmt); \ + gf_log (this->name, GF_LOG_TRACE, fmt); \ } while (0) int @@ -694,7 +774,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this, int _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { - char 
*key_root = "gluster"; + char *key_root = "storage.gluster"; char *xlator_name = NULL; char *instance_name = NULL; size_t key_len = 0; @@ -719,7 +799,7 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { } if (strcmp (__progname, "glusterfsd") == 0) - key_root = "gluster.brick"; + key_root = "storage.gluster.brick"; if (instance_name) { /* +3 for 2 x "." + NULL */ @@ -779,6 +859,7 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, float fop_lat_min; float fop_lat_max; double interval_sec; + loc_t unused_loc = {0, }; interval_sec = ((now->tv_sec * 1000000.0 + now->tv_usec) - (stats->started_at.tv_sec * 1000000.0 + @@ -883,6 +964,29 @@ io_stats_dump_global_to_json_logfp (xlator_t *this, "\"%s.%s.fop.%s.latency_max_usec\": \"%0.2lf\",", key_prefix, str_prefix, lc_fop_name, fop_lat_max); } + + dict_t *xattr = NULL; + ret = syncop_getxattr (this, &unused_loc, &xattr, + IO_THREADS_QUEUE_SIZE_KEY, NULL, NULL); + if (xattr) { + // Iterate over the dictionary returned to us by io-threads and + // dump the results to the stats file. 
+ data_pair_t *curr = NULL; + dict_for_each (xattr, curr) { + ios_log (this, logfp, + "\"%s.%s.%s.queue_size\": \"%d\",", + key_prefix, str_prefix, curr->key, + data_to_int32 (curr->value)); + } + + // Free the dictionary + dict_unref (xattr); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Unable to get queue size counts from " + "the io-threads translator!"); + } + if (interval == -1) { ios_log (this, logfp, "\"%s.%s.uptime\": \"%"PRId64"\",", key_prefix, str_prefix, @@ -1010,7 +1114,10 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, char *port_pos = NULL; char *group_name = NULL; char *username = NULL; + char *path = NULL; struct ios_conf *conf = NULL; + const char *error_string = NULL; + int32_t op_errno = 0; conf = this->private; @@ -1057,12 +1164,22 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample, sprintf (group_name, "%d", (int32_t)sample->gid); } + path = "Unknown"; + if (sample->have_path) + path = sample->path; + + error_string = "No Error"; + if (sample->op_ret != 0) { + op_errno = abs (sample->op_errno); + error_string = strerror (op_errno); + } + ios_log (this, logfp, - "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s", + "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s", epoch_time, fop_enum_to_pri_string (sample->fop_type), fop_enum_to_string (sample->fop_type), sample->elapsed, xlator_name, instance_name, username, - group_name, hostname, port); + group_name, hostname, port, path, op_errno, error_string); goto out; err: gf_log (this->name, GF_LOG_ERROR, @@ -1608,14 +1725,87 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd) return 0; } +void ios_local_get_inode (struct ios_local *local, inode_t **inode) +{ + if (!local) + return; + + /* In the cases that a loc is given to us, + * we should use that as the source of truth + * for the inode. 
+ */ + if (local->loc.inode) { + *inode = local->loc.inode; + return; + } + + /* Fall back to the inode in the local struct, + * but there is no guarantee this will be a valid + * pointer. + */ + *inode = local->inode; +} + +void ios_local_get_path (call_frame_t *frame, const char **path) +{ + struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; + inode_t *inode = NULL; + + local = frame->local; + if (!local) + goto out; + + ios_local_get_inode (local, &inode); + + if (inode) { + /* Each inode should have an iosstat struct attached to it. + * This is the preferred way to retrieve the path. + */ + ios_inode_ctx_get (inode, frame->this, &iosstat); + if (iosstat) { + gf_log ("io-stats", GF_LOG_DEBUG, + "[%s] Getting path from iostat struct", + fop_enum_to_string (frame->op)); + *path = iosstat->filename; + goto out; + } + } + + /* If we don't have the iosstat attached to the inode, + * fall back to retrieving the path via the loc struct + * inside the local. + */ + if (local->loc.path) { + gf_log ("io-stats", GF_LOG_DEBUG, + "[%s] Getting path from loc_t", + fop_enum_to_string (frame->op)); + *path = local->loc.path; + goto out; + } + +out: + /* If the inode and the loc don't have the path, we're out of luck. 
+ */ + if (!*path) { + gf_log ("io-stats", GF_LOG_DEBUG, + "Unable to get path for fop: %s", + fop_enum_to_string (frame->op)); + } + + return; +} + void collect_ios_latency_sample (struct ios_conf *conf, glusterfs_fop_t fop_type, double elapsed, - call_frame_t *frame) + call_frame_t *frame, int32_t op_ret, int32_t op_errno) { + struct ios_local *ios_local = NULL; ios_sample_buf_t *ios_sample_buf = NULL; ios_sample_t *ios_sample = NULL; struct timeval *timestamp = NULL; call_stack_t *root = NULL; + const char *path = NULL; ios_sample_buf = conf->ios_sample_buf; @@ -1630,6 +1820,8 @@ void collect_ios_latency_sample (struct ios_conf *conf, ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]); ios_sample->elapsed = elapsed; ios_sample->fop_type = fop_type; + ios_sample->op_ret = op_ret; + ios_sample->op_errno = op_errno; ios_sample->uid = root->uid; ios_sample->gid = root->gid; (ios_sample->timestamp).tv_sec = timestamp->tv_sec; @@ -1637,6 +1829,52 @@ void collect_ios_latency_sample (struct ios_conf *conf, memcpy (&ios_sample->identifier, &root->identifier, sizeof (root->identifier)); + /* Eventually every FOP will be supported + * (i.e., the frame->local will be + * of type struct ios_local), but for now, this is a safety. 
+ */ + switch (ios_sample->fop_type) { + + case GF_FOP_CREATE: + case GF_FOP_OPEN: + case GF_FOP_STAT: + case GF_FOP_FSTAT: + case GF_FOP_READ: + case GF_FOP_WRITE: + case GF_FOP_OPENDIR: + case GF_FOP_READDIRP: + case GF_FOP_READDIR: + case GF_FOP_FLUSH: + case GF_FOP_ACCESS: + case GF_FOP_UNLINK: + case GF_FOP_TRUNCATE: + case GF_FOP_MKDIR: + case GF_FOP_RMDIR: + case GF_FOP_SETATTR: + case GF_FOP_LOOKUP: + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + case GF_FOP_ENTRYLK: + case GF_FOP_FXATTROP: + case GF_FOP_XATTROP: + case GF_FOP_GETXATTR: + case GF_FOP_FGETXATTR: + case GF_FOP_SETXATTR: + case GF_FOP_FSETXATTR: + case GF_FOP_STATFS: + case GF_FOP_FSYNC: + ios_local_get_path (frame, &path); + break; + default: + path = NULL; + break; + } + + if (path) { + strncpy (ios_sample->path, path, sizeof (ios_sample->path)); + ios_sample->have_path = _gf_true; + } + /* We've reached the end of the circular buffer, start from the * beginning. */ if (ios_sample_buf->pos == (ios_sample_buf->size - 1)) @@ -1674,7 +1912,7 @@ update_ios_latency_stats (struct ios_global_stats *stats, double elapsed, int update_ios_latency (struct ios_conf *conf, call_frame_t *frame, - glusterfs_fop_t op) + glusterfs_fop_t op, int32_t op_ret, int32_t op_errno) { double elapsed; struct timeval *begin, *end; @@ -1687,7 +1925,7 @@ update_ios_latency (struct ios_conf *conf, call_frame_t *frame, update_ios_latency_stats (&conf->cumulative, elapsed, op); update_ios_latency_stats (&conf->incremental, elapsed, op); - collect_ios_latency_sample (conf, op, elapsed, frame); + collect_ios_latency_sample (conf, op, elapsed, frame, op_ret, op_errno); return 0; } @@ -1811,40 +2049,100 @@ unlock_list_head: return ret; } +static int +attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, + const uuid_t gfid) { + struct ios_stat *iosstat = NULL; + + if (!inode) { + return -EINVAL; + } + + ios_inode_ctx_get (inode, this, &iosstat); + if (!iosstat) { + iosstat = GF_CALLOC (1, sizeof 
(*iosstat), + gf_io_stats_mt_ios_stat); + if (!iosstat) { + return -ENOMEM; + } + iosstat->filename = gf_strdup (path); + gf_uuid_copy (iosstat->gfid, gfid); + LOCK_INIT (&iosstat->lock); + ios_inode_ctx_set (inode, this, iosstat); + } + + return 0; +} + + +int +ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd) +{ + struct ios_fd *ifd = NULL; + int ret = 0; + + ifd = GF_CALLOC (1, sizeof (*ifd), gf_io_stats_mt_ios_fd); + if (!ifd) { + ret = -ENOMEM; + goto free_and_out; + } + + if (path) { + ifd->filename = gf_strdup (path); + if (!ifd->filename) { + ret = -ENOMEM; + goto free_and_out; + } + } + + gettimeofday (&ifd->opened_at, NULL); + + if (fd) + ios_fd_ctx_set (fd, this, ifd); + + *iosfd = ifd; + + return ret; + + /* Failure path */ +free_and_out: + if (ifd) { + GF_FREE (ifd->filename); + GF_FREE (ifd); + } + + *iosfd = NULL; + + return ret; +} + + int io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - struct ios_fd *iosfd = NULL; - char *path = NULL; - struct ios_stat *iosstat = NULL; - struct ios_conf *conf = NULL; - - conf = this->private; + struct ios_local *local = NULL; + struct ios_conf *conf = NULL; + struct ios_fd *iosfd = NULL; - path = frame->local; - frame->local = NULL; - - if (!path) + if (op_ret < 0) { goto unwind; + } - if (op_ret < 0) { - GF_FREE (path); + local = frame->local; + if (!local) { goto unwind; } - iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); + conf = this->private; + + ios_build_fd (this, local->loc.path, fd, &iosfd); if (!iosfd) { - GF_FREE (path); goto unwind; } - iosfd->filename = path; - gettimeofday (&iosfd->opened_at, NULL); - - ios_fd_ctx_set (fd, this, iosfd); LOCK (&conf->lock); { conf->cumulative.nr_opens++; @@ -1855,18 +2153,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK 
(&conf->lock); - iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); - if (!iosstat) { - GF_FREE (path); - goto unwind; - } - iosstat->filename = gf_strdup (path); - gf_uuid_copy (iosstat->gfid, buf->ia_gfid); - LOCK_INIT (&iosstat->lock); - ios_inode_ctx_set (fd->inode, this, iosstat); + attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, + buf->ia_gfid); unwind: - UPDATE_PROFILE_STATS (frame, CREATE); + UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, preparent, postparent, xdata); return 0; @@ -1877,44 +2169,24 @@ int io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - struct ios_fd *iosfd = NULL; - char *path = NULL; - struct ios_stat *iosstat = NULL; - struct ios_conf *conf = NULL; - - conf = this->private; - path = frame->local; - frame->local = NULL; - - if (!path) - goto unwind; + struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; + struct ios_conf *conf = NULL; + struct ios_fd *iosfd = NULL; if (op_ret < 0) { - GF_FREE (path); goto unwind; } - iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); - if (!iosfd) { - GF_FREE (path); + local = frame->local; + if (!local) { goto unwind; } - iosfd->filename = path; - gettimeofday (&iosfd->opened_at, NULL); - - ios_fd_ctx_set (fd, this, iosfd); - - ios_inode_ctx_get (fd->inode, this, &iosstat); - if (!iosstat) { - iosstat = GF_CALLOC (1, sizeof (*iosstat), - gf_io_stats_mt_ios_stat); - if (iosstat) { - iosstat->filename = gf_strdup (path); - gf_uuid_copy (iosstat->gfid, fd->inode->gfid); - LOCK_INIT (&iosstat->lock); - ios_inode_ctx_set (fd->inode, this, iosstat); - } + conf = this->private; + ios_build_fd (this, local->loc.path, fd, &iosfd); + if (!iosfd) { + goto unwind; } LOCK (&conf->lock); @@ -1926,13 +2198,19 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, } } UNLOCK (&conf->lock); + + ios_inode_ctx_get (fd->inode, this, &iosstat); if (iosstat) { BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN); - iosstat = NULL; } -unwind: - UPDATE_PROFILE_STATS (frame, OPEN); + attach_iosstat_to_inode (this, local->loc.inode, + local->loc.path, + local->loc.inode->gfid); + +unwind: + UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata); return 0; @@ -1943,7 +2221,8 @@ int io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, STAT); + UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -1956,26 +2235,29 @@ io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iobref *iobref, dict_t *xdata) { int len = 0; - fd_t *fd = NULL; struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; - fd = frame->local; - frame->local = NULL; + local = frame->local; + if (!local || !local->fd) + goto unwind; if (op_ret > 0) { len = iov_length (vector, count); - BUMP_READ (fd, len); + BUMP_READ (local->fd, len); } - UPDATE_PROFILE_STATS (frame, READ); - ios_inode_ctx_get (fd->inode, this, &iosstat); + UPDATE_PROFILE_STATS (frame, READ, op_ret, op_errno); + ios_inode_ctx_get (local->fd->inode, this, &iosstat); if (iosstat) { - BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); - BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); - iosstat = NULL; + BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); + BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); + } +unwind: + ios_free_local (frame); STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, buf, iobref, xdata); return 0; @@ -1989,21 +2271,23 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prebuf, struct 
iatt *postbuf, dict_t *xdata) { struct ios_stat *iosstat = NULL; + struct ios_local *local = NULL; inode_t *inode = NULL; - UPDATE_PROFILE_STATS (frame, WRITE); - if (frame->local){ - inode = frame->local; - frame->local = NULL; - ios_inode_ctx_get (inode, this, &iosstat); - if (iosstat) { - BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); - BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); - inode = NULL; - iosstat = NULL; - } - } + local = frame->local; + if (!local || !local->fd) + goto unwind; + UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno); + + ios_inode_ctx_get (local->inode, this, &iosstat); + + if (iosstat) { + BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); + BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); + } +unwind: + ios_free_local (frame); STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2021,7 +2305,7 @@ io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, frame->local = NULL; - UPDATE_PROFILE_STATS (frame, READDIRP); + UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno); ios_inode_ctx_get (inode, this, &iosstat); @@ -2039,7 +2323,16 @@ int io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, READDIR); + struct ios_local *local = NULL; + struct ios_stat *iosstat = NULL; + + local = frame->local; + + UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); + + ios_free_local (frame); + + UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2050,8 +2343,10 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSYNC); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + UPDATE_PROFILE_STATS (frame, FSYNC, 
op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); return 0; } @@ -2061,7 +2356,8 @@ io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SETATTR); + UPDATE_PROFILE_STATS (frame, SETATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, xdata); return 0; } @@ -2072,7 +2368,8 @@ io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, UNLINK); + UPDATE_PROFILE_STATS (frame, UNLINK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent, postparent, xdata); return 0; @@ -2086,7 +2383,7 @@ io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preoldparent, struct iatt *postoldparent, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, RENAME); + UPDATE_PROFILE_STATS (frame, RENAME, op_ret, op_errno); STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent, postoldparent, prenewparent, postnewparent, xdata); @@ -2099,7 +2396,8 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *sbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, READLINK); + UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata); return 0; } @@ -2111,7 +2409,14 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xdata, struct iatt *postparent) { - UPDATE_PROFILE_STATS (frame, 
LOOKUP); + struct ios_local *local = frame->local; + + if (local && local->loc.path && inode && op_ret >= 0) { + attach_iosstat_to_inode (this, inode, local->loc.path, + inode->gfid); + } + UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata, postparent); return 0; @@ -2124,7 +2429,7 @@ io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SYMLINK); + UPDATE_PROFILE_STATS (frame, SYMLINK, op_ret, op_errno); STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2137,7 +2442,7 @@ io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, MKNOD); + UPDATE_PROFILE_STATS (frame, MKNOD, op_ret, op_errno); STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2151,28 +2456,16 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - struct ios_stat *iosstat = NULL; - char *path = frame->local; + struct ios_local *local = frame->local; - if (!path) - goto unwind; - - UPDATE_PROFILE_STATS (frame, MKDIR); - if (op_ret < 0) - goto unwind; - - iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); - if (iosstat) { - LOCK_INIT (&iosstat->lock); - iosstat->filename = gf_strdup(path); - gf_uuid_copy (iosstat->gfid, buf->ia_gfid); - ios_inode_ctx_set (inode, this, iosstat); + if (local && local->loc.path) { + local->inode = inode_ref (inode); + attach_iosstat_to_inode (this, inode, local->loc.path, + buf->ia_gfid); } -unwind: - /* local is assigned with path */ - GF_FREE 
(frame->local); - frame->local = NULL; + UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2185,7 +2478,7 @@ io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, LINK); + UPDATE_PROFILE_STATS (frame, LINK, op_ret, op_errno); STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf, preparent, postparent, xdata); return 0; @@ -2196,7 +2489,8 @@ int io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FLUSH); + UPDATE_PROFILE_STATS (frame, FLUSH, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata); return 0; } @@ -2206,20 +2500,28 @@ int io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - struct ios_stat *iosstat = NULL; - int ret = -1; + struct ios_local *local = NULL; + struct ios_stat *iosstat = NULL; + int ret = -1; + + local = frame->local; + if (!local || !local->fd) + goto unwind; - UPDATE_PROFILE_STATS (frame, OPENDIR); if (op_ret < 0) goto unwind; - ios_fd_ctx_set (fd, this, 0); + attach_iosstat_to_inode (this, local->inode, local->loc.path, + local->inode->gfid); - ret = ios_inode_ctx_get (fd->inode, this, &iosstat); - if (!ret) + ios_fd_ctx_set (local->fd, this, 0); + ios_inode_ctx_get (local->fd->inode, this, &iosstat); + if (iosstat) BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR); unwind: + UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata); return 0; } @@ -2231,8 +2533,8 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, RMDIR); - + UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent, postparent, xdata); return 0; @@ -2244,7 +2546,8 @@ io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, TRUNCATE); + UPDATE_PROFILE_STATS (frame, TRUNCATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2255,7 +2558,8 @@ int io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, STATFS); + UPDATE_PROFILE_STATS (frame, STATFS, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2265,7 +2569,8 @@ int io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, SETXATTR); + UPDATE_PROFILE_STATS (frame, SETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2275,7 +2580,8 @@ int io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, GETXATTR); + UPDATE_PROFILE_STATS (frame, GETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2285,7 +2591,8 @@ int io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, 
REMOVEXATTR); + UPDATE_PROFILE_STATS (frame, REMOVEXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2294,7 +2601,8 @@ int io_stats_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSETXATTR); + UPDATE_PROFILE_STATS (frame, FSETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2304,7 +2612,8 @@ int io_stats_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FGETXATTR); + UPDATE_PROFILE_STATS (frame, FGETXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2314,7 +2623,8 @@ int io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FREMOVEXATTR); + UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata); return 0; } @@ -2324,7 +2634,8 @@ int io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSYNCDIR); + UPDATE_PROFILE_STATS (frame, FSYNCDIR, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata); return 0; } @@ -2334,7 +2645,20 @@ int io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, ACCESS); + struct ios_local *local = frame->local; + + /* ACCESS is called before a READ when a fop fails over + * in NFS. 
We need to make sure that we are attaching the + * data correctly to this inode. + */ + if (local->loc.inode && local->loc.path) { + attach_iosstat_to_inode (this, local->loc.inode, + local->loc.path, + local->loc.inode->gfid); + } + + UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata); return 0; } @@ -2345,7 +2669,8 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FTRUNCATE); + UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2356,7 +2681,8 @@ int io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FSTAT); + UPDATE_PROFILE_STATS (frame, FSTAT, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata); return 0; } @@ -2367,8 +2693,9 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, FALLOCATE); - STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -2379,8 +2706,9 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, DISCARD); - STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + UPDATE_PROFILE_STATS (frame, 
DISCARD, op_ret, op_errno); + ios_free_local (frame); + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; } @@ -2390,7 +2718,8 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - UPDATE_PROFILE_STATS(frame, ZEROFILL); + UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -2400,7 +2729,8 @@ int io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, LK); + UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata); return 0; } @@ -2410,7 +2740,8 @@ int io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, ENTRYLK); + UPDATE_PROFILE_STATS (frame, ENTRYLK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata); return 0; } @@ -2420,7 +2751,8 @@ int io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, XATTROP); + UPDATE_PROFILE_STATS (frame, XATTROP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata); return 0; } @@ -2430,7 +2762,8 @@ int io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, FXATTROP); + UPDATE_PROFILE_STATS (frame, FXATTROP, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata); 
return 0; } @@ -2440,7 +2773,8 @@ int io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - UPDATE_PROFILE_STATS (frame, INODELK); + UPDATE_PROFILE_STATS (frame, INODELK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata); return 0; } @@ -2450,6 +2784,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { + ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_entrylk_cbk, @@ -2464,6 +2800,7 @@ int io_stats_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); @@ -2479,8 +2816,8 @@ int io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - - UPDATE_PROFILE_STATS (frame, FINODELK); + UPDATE_PROFILE_STATS (frame, FINODELK, op_ret, op_errno); + ios_free_local (frame); STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata); return 0; } @@ -2490,6 +2827,7 @@ int io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_finodelk_cbk, @@ -2504,6 +2842,7 @@ int io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_xattrop_cbk, @@ -2518,6 +2857,7 @@ int io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fxattrop_cbk, @@ -2532,6 
+2872,7 @@ int io_stats_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_lookup_cbk, @@ -2545,6 +2886,7 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this, int io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_stat_cbk, @@ -2559,6 +2901,7 @@ int io_stats_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_readlink_cbk, @@ -2573,6 +2916,7 @@ int io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_mknod_cbk, @@ -2587,9 +2931,7 @@ int io_stats_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { - if (loc->path) - frame->local = gf_strdup (loc->path); - + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_mkdir_cbk, @@ -2604,6 +2946,7 @@ int io_stats_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_unlink_cbk, @@ -2618,6 +2961,7 @@ int io_stats_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_rmdir_cbk, @@ -2674,6 +3018,7 @@ int io_stats_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_setattr_cbk, @@ -2688,6 +3033,7 @@ int io_stats_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t 
*xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_truncate_cbk, @@ -2702,8 +3048,8 @@ int io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - if (loc->path) - frame->local = gf_strdup (loc->path); + ios_track_loc (frame, loc); + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); @@ -2719,9 +3065,10 @@ int io_stats_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) + { - if (loc->path) - frame->local = gf_strdup (loc->path); + ios_track_loc (frame, loc); + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); @@ -2737,8 +3084,7 @@ int io_stats_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - frame->local = fd; - + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_readv_cbk, @@ -2756,9 +3102,12 @@ io_stats_writev (call_frame_t *frame, xlator_t *this, uint32_t flags, struct iobref *iobref, dict_t *xdata) { int len = 0; + struct ios_conf *conf = NULL; + struct ios_local *local = NULL; + int ret = 0; + + ios_track_fd (frame, fd); - if (fd->inode) - frame->local = fd->inode; len = iov_length (vector, count); BUMP_WRITE (fd, len); @@ -2777,6 +3126,7 @@ int io_stats_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_statfs_cbk, @@ -2791,6 +3141,7 @@ int io_stats_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_flush_cbk, @@ -2805,6 +3156,7 @@ int io_stats_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fsync_cbk, @@ -2971,7 +3323,7 @@ _ios_dump_thread (xlator_t *this) { 
stats_filename, strerror(errno)); log_stats_fopen_failure = _gf_false; } - samples_logfp = fopen (samples_filename, "w+"); + samples_logfp = fopen (samples_filename, "a"); if (samples_logfp) { io_stats_dump_latency_samples_logfp (this, samples_logfp); @@ -3024,6 +3376,8 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this, goto out; } + ios_track_loc (frame, loc); + START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_setxattr_cbk, @@ -3042,6 +3396,7 @@ int io_stats_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_getxattr_cbk, @@ -3056,6 +3411,7 @@ int io_stats_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_removexattr_cbk, @@ -3071,6 +3427,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fsetxattr_cbk, @@ -3085,6 +3442,7 @@ int io_stats_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fgetxattr_cbk, @@ -3099,6 +3457,7 @@ int io_stats_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_fremovexattr_cbk, @@ -3170,6 +3529,7 @@ int io_stats_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, dict_t *xdata) { + ios_track_loc (frame, loc); START_FOP_LATENCY (frame); STACK_WIND (frame, io_stats_access_cbk, @@ -3212,6 +3572,7 @@ int io_stats_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { + ios_track_fd (frame, fd); START_FOP_LATENCY (frame); STACK_WIND (frame, 
io_stats_fstat_cbk, diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c index 270632bc71b..2eb3a9f9149 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc, case RPC_CLNT_DISCONNECT: case RPC_CLNT_MSG: case RPC_CLNT_DESTROY: + case RPC_CLNT_PING: break; } diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c index 77637c7beec..459d173db7f 100644 --- a/xlators/features/changelog/src/changelog-ev-handle.c +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -180,6 +180,8 @@ changelog_rpc_notify (struct rpc_clnt *rpc, /* Free up mydata */ changelog_rpc_clnt_unref (crpc); break; + case RPC_CLNT_PING: + break; } return 0; diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 640c6bb5553..d7c210f24a5 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -234,6 +234,7 @@ blkd: continue; bcount++; + list_del_init (&ilock->client_list); list_del_init (&ilock->blocked_locks); list_add (&ilock->blocked_locks, &released); } @@ -268,6 +269,7 @@ granted: continue; gcount++; + list_del_init (&ilock->client_list); list_del_init (&ilock->list); list_add (&ilock->list, &released); } @@ -321,6 +323,7 @@ blkd: bcount++; + list_del_init (&elock->client_list); list_del_init (&elock->blocked_locks); list_add_tail (&elock->blocked_locks, &released); } @@ -355,6 +358,7 @@ granted: } gcount++; + list_del_init (&elock->client_list); list_del_init (&elock->domain_list); list_add_tail (&elock->domain_list, &removed); diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index 68904f63140..8a56c4205d9 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -1108,3 
+1108,16 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) return conf; } + +gf_boolean_t +pl_does_monkey_want_stuck_lock() +{ + long int monkey_unlock_rand = 0; + long int monkey_unlock_rand_rem = 0; + + monkey_unlock_rand = random (); + monkey_unlock_rand_rem = monkey_unlock_rand % 100; + if (monkey_unlock_rand_rem == 0) + return _gf_true; + return _gf_false; +} diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 5486f9b8314..3729ca24bed 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -161,4 +161,7 @@ pl_metalock_is_active (pl_inode_t *pl_inode); int __pl_queue_lock (pl_inode_t *pl_inode, posix_lock_t *reqlock, int can_block); + +gf_boolean_t +pl_does_monkey_want_stuck_lock(); #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 783c57e6381..626541237b3 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -16,9 +16,9 @@ #include "list.h" #include "locks.h" +#include "clear.h" #include "common.h" - void __pl_entrylk_unref (pl_entry_lock_t *lock) { @@ -111,6 +111,97 @@ __conflicting_entrylks (pl_entry_lock_t *l1, pl_entry_lock_t *l2) return 0; } +/* See comments in inodelk.c for details */ +static inline gf_boolean_t +__stale_entrylk (xlator_t *this, pl_entry_lock_t *candidate_lock, + pl_entry_lock_t *requested_lock, time_t *lock_age_sec) +{ + posix_locks_private_t *priv = NULL; + struct timeval curr; + gettimeofday (&curr, NULL); + + priv = this->private; + + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? 
+ */ + if (names_conflict (candidate_lock->basename, + requested_lock->basename)) { + *lock_age_sec = curr.tv_sec - + candidate_lock->granted_time.tv_sec; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; +} + +/* See comments in inodelk.c for details */ +static gf_boolean_t +__entrylk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_entry_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + args.type = CLRLK_ENTRY; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + + if (list_empty (&dom->entrylk_list)) + goto out; + + pthread_mutex_lock (&pinode->mutex); + lock->pinode = pinode; + list_for_each_entry_safe (lk, tmp, &dom->entrylk_list, domain_list) { + if (__stale_entrylk (this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe (lk, tmp, &dom->blocked_entrylks, + blocked_locks) { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock (&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_entrylk (this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log (this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Entry lock revoked: %d granted & %d " + "blocked locks cleared", reason_str, + uuid_utoa (pinode->gfid), dom->domain, lk_age_sec, + gcount, bcount); + } + + return revoke_lock; +} + /** * 
entrylk_grantable - is this lock grantable? * @inode: inode in which to look @@ -546,6 +637,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pl_ctx_t *ctx = NULL; int nonblock = 0; gf_boolean_t need_inode_unref = _gf_false; + posix_locks_private_t *priv = NULL; + + priv = this->private; if (xdata) dict_ret = dict_get_str (xdata, "connection-id", &conn_id); @@ -599,6 +693,18 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, * current stack unwinds. */ pinode->inode = inode_ref (inode); + if (priv->revocation_secs != 0) { + if (cmd != ENTRYLK_UNLOCK) { + __entrylk_prune_stale (this, pinode, dom, reqlock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock ()) { + gf_log (this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + op_ret = 0; + goto out; + } + } + } switch (cmd) { case ENTRYLK_LOCK_NB: @@ -678,8 +784,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, "a bug report at http://bugs.gluster.com", cmd); goto out; } - if (need_inode_unref) - inode_unref (pinode->inode); /* The following (extra) unref corresponds to the ref that * was done at the time the lock was granted. @@ -689,6 +793,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, out: + if (need_inode_unref) + inode_unref (pinode->inode); + if (unwind) { entrylk_trace_out (this, frame, volume, fd, loc, basename, cmd, type, op_ret, op_errno); @@ -810,6 +917,8 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) * blocked to avoid leaving L1 to starve forever. * iv. unref the object. 
*/ + list_del_init (&l->client_list); + if (!list_empty (&l->domain_list)) { list_del_init (&l->domain_list); list_add_tail (&l->client_list, diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 1564f26b8fb..275fb9d20e4 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -16,6 +16,7 @@ #include "list.h" #include "locks.h" +#include "clear.h" #include "common.h" void @@ -130,6 +131,105 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2) inodelk_type_conflict (l1, l2)); } +/* + * Check to see if the candidate lock overlaps/conflicts with the + * requested lock. If so, determine how old the lock is and return + * true if it exceeds the configured threshold, false otherwise. + */ +static inline gf_boolean_t +__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock, + pl_inode_lock_t *requested_lock, time_t *lock_age_sec) +{ + posix_locks_private_t *priv = NULL; + struct timeval curr; + + priv = this->private; + gettimeofday (&curr, NULL); + /* Question: Should we just prune them all given the + * chance? Or just the locks we are attempting to acquire? + */ + if (inodelk_conflict (candidate_lock, requested_lock)) { + *lock_age_sec = curr.tv_sec - + candidate_lock->granted_time.tv_sec; + if (*lock_age_sec > priv->revocation_secs) + return _gf_true; + } + return _gf_false; +} + +/* Examine any locks held on this inode and potentially revoke the lock + * if the age exceeds revocation_secs. We will clear _only_ those locks + * which are granted, and then grant those locks which are blocked. + * + * Depending on how this patch works in the wild, we may expand this and + * introduce a heuristic which clears blocked locks as well if they + * are beyond a threshold. 
+ */ +static gf_boolean_t +__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, + pl_inode_lock_t *lock) +{ + posix_locks_private_t *priv = NULL; + pl_inode_lock_t *tmp = NULL; + pl_inode_lock_t *lk = NULL; + gf_boolean_t revoke_lock = _gf_false; + int bcount = 0; + int gcount = 0; + int op_errno = 0; + clrlk_args args; + args.opts = NULL; + time_t lk_age_sec = 0; + uint32_t max_blocked = 0; + char *reason_str = NULL; + + priv = this->private; + + args.type = CLRLK_INODE; + if (priv->revocation_clear_all == _gf_true) + args.kind = CLRLK_ALL; + else + args.kind = CLRLK_GRANTED; + + if (list_empty (&dom->inodelk_list)) + goto out; + + pthread_mutex_lock (&pinode->mutex); + list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) { + if (__stale_inodelk (this, lk, lock, &lk_age_sec) == _gf_true) { + revoke_lock = _gf_true; + reason_str = "age"; + break; + } + } + + max_blocked = priv->revocation_max_blocked; + if (max_blocked != 0 && revoke_lock == _gf_false) { + list_for_each_entry_safe (lk, tmp, &dom->blocked_inodelks, + blocked_locks) { + max_blocked--; + if (max_blocked == 0) { + revoke_lock = _gf_true; + reason_str = "max blocked"; + break; + } + } + } + pthread_mutex_unlock (&pinode->mutex); + +out: + if (revoke_lock == _gf_true) { + clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount, + &op_errno); + gf_log (this->name, GF_LOG_WARNING, + "Lock revocation [reason: %s; gfid: %s; domain: %s; " + "age: %ld sec] - Inode lock revoked: %d granted & %d " + "blocked locks cleared", + reason_str, uuid_utoa (pinode->gfid), dom->domain, + lk_age_sec, gcount, bcount); + } + return revoke_lock; +} + /* Determine if lock is grantable or not */ static pl_inode_lock_t * __inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) @@ -419,8 +519,6 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) { list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers, client_list) { - list_del_init (&l->client_list); - 
pl_inodelk_log_cleanup (l); pl_inode = l->pl_inode; @@ -458,6 +556,8 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx) * forever. * iv. unref the object. */ + list_del_init (&l->client_list); + if (!list_empty (&l->list)) { __delete_inode_lock (l); list_add_tail (&l->client_list, @@ -509,6 +609,7 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom, inode_t *inode) { + posix_locks_private_t *priv = NULL; int ret = -EINVAL; pl_inode_lock_t *retlock = NULL; gf_boolean_t unref = _gf_true; @@ -518,6 +619,8 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, lock->pl_inode = pl_inode; fl_type = lock->fl_type; + priv = this->private; + /* Ideally, AFTER a successful lock (both blocking and non-blocking) or * an unsuccessful blocking lock operation, the inode needs to be ref'd. * @@ -537,6 +640,18 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, */ pl_inode->inode = inode_ref (inode); + if (priv->revocation_secs != 0) { + if (lock->fl_type != F_UNLCK) { + __inodelk_prune_stale (this, pl_inode, dom, lock); + } else if (priv->monkey_unlocking == _gf_true) { + if (pl_does_monkey_want_stuck_lock ()) { + gf_log (this->name, GF_LOG_WARNING, + "MONKEY LOCKING (forcing stuck lock)!"); + return 0; + } + } + } + if (ctx) pthread_mutex_lock (&ctx->lock); pthread_mutex_lock (&pl_inode->mutex); diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index e363f425b65..8eb35da44be 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -190,6 +190,10 @@ typedef struct { mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */ gf_boolean_t trace; /* trace lock requests in and out */ char *brickname; + gf_boolean_t monkey_unlocking; + uint32_t revocation_secs; + gf_boolean_t revocation_clear_all; + uint32_t revocation_max_blocked; } posix_locks_private_t; diff --git 
a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index f217220a04b..616be0f7cff 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -3627,7 +3627,21 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("trace", priv->trace, options, bool, out); + GF_OPTION_RECONF ("monkey-unlocking", priv->monkey_unlocking, options, + bool, out); + + GF_OPTION_RECONF ("revocation-secs", + priv->revocation_secs, options, + uint32, out); + + GF_OPTION_RECONF ("revocation-clear-all", priv->revocation_clear_all, + options, bool, out); + + GF_OPTION_RECONF ("revocation-max-blocked", + priv->revocation_max_blocked, options, + uint32, out); ret = 0; + out: return ret; } @@ -3678,6 +3692,18 @@ init (xlator_t *this) GF_OPTION_INIT ("trace", priv->trace, bool, out); + GF_OPTION_INIT ("monkey-unlocking", priv->monkey_unlocking, + bool, out); + + GF_OPTION_INIT ("revocation-secs", priv->revocation_secs, + uint32, out); + + GF_OPTION_INIT ("revocation-clear-all", priv->revocation_clear_all, + bool, out); + + GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked, + uint32, out); + this->local_pool = mem_pool_new (pl_local_t, 32); if (!this->local_pool) { ret = -1; @@ -3934,5 +3960,35 @@ struct volume_options options[] = { .description = "Trace the different lock requests " "to logs." }, + { .key = { "monkey-unlocking" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Ignore a random number of unlock requests. Useful " + "for testing/creating robust lock recovery mechanisms." 
+ }, + { .key = {"revocation-secs"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .description = "Maximum time a lock can be taken out, before" + "being revoked.", + }, + { .key = {"revocation-clear-all"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "If set to true, will revoke BOTH granted and blocked " + "(pending) lock requests if a revocation threshold is " + "hit.", + }, + { .key = {"revocation-max-blocked"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = INT_MAX, + .default_value = "0", + .description = "A number of blocked lock requests after which a lock " + "will be revoked to allow the others to proceed. Can " + "be used in conjunction w/ revocation-clear-all." + }, { .key = {NULL} }, }; diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c index f578f6c3f44..9201f38f7ff 100644 --- a/xlators/features/marker/src/marker.c +++ b/xlators/features/marker/src/marker.c @@ -390,13 +390,6 @@ _is_quota_internal_xattr (dict_t *d, char *k, data_t *v, void *data) if (fnmatch ("trusted.glusterfs.quota*", k, 0) == 0) return _gf_true; - /* It would be nice if posix filters pgfid xattrs. 
But since marker - * also takes up responsibility to clean these up, adding the filtering - * here (Check 'quota_xattr_cleaner') - */ - if (fnmatch (PGFID_XATTR_KEY_PREFIX"*", k, 0) == 0) - return _gf_true; - return _gf_false; } @@ -1598,9 +1591,10 @@ marker_get_oldpath_contribution (call_frame_t *lk_frame, void *cookie, */ MARKER_SET_UID_GID (frame, local, frame->root); - if (gf_uuid_is_null (oplocal->loc.gfid)) - gf_uuid_copy (oplocal->loc.gfid, - oplocal->loc.inode->gfid); + if (gf_uuid_is_null (oplocal->loc.gfid)) { + gf_uuid_copy (oplocal->loc.gfid, + oplocal->loc.inode->gfid); + } GF_UUID_ASSERT (oplocal->loc.gfid); diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index dd7bf809e21..2e68b318a9c 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -2200,7 +2200,7 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_INFO, EINVAL, Q_MSG_INODE_CTX_GET_FAILED, "quota context not set inode (gfid:%s)", - uuid_utoa (local->loc.inode->gfid)); + uuid_utoa (local->loc.gfid)); goto out; } diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c index fc2ff2ab10d..f5062971bf4 100644 --- a/xlators/features/snapview-server/src/snapview-server-mgmt.c +++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c @@ -73,7 +73,7 @@ svs_mgmt_init (xlator_t *this) if (cmd_args->volfile_server) host = cmd_args->volfile_server; - ret = rpc_transport_inet_options_build (&options, host, port); + ret = rpc_transport_inet_options_build (&options, host, port, NULL); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to build the " "transport options"); diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index bf62290d023..1770d9dd874 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ 
b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -3418,7 +3418,7 @@ glusterd_transport_keepalive_options_get (int *interval, int *time, int glusterd_transport_inet_options_build (dict_t **options, const char *hostname, - int port) + int port, char *addr_family) { dict_t *dict = NULL; int32_t interval = -1; @@ -3433,7 +3433,8 @@ glusterd_transport_inet_options_build (dict_t **options, const char *hostname, port = GLUSTERD_DEFAULT_PORT; /* Build default transport options */ - ret = rpc_transport_inet_options_build (&dict, hostname, port); + ret = rpc_transport_inet_options_build (&dict, hostname, port, + addr_family); if (ret) goto out; @@ -3470,6 +3471,7 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo, int ret = -1; glusterd_peerctx_t *peerctx = NULL; data_t *data = NULL; + char *addr_family = NULL; peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t); if (!peerctx) @@ -3485,9 +3487,15 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo, uniquely identify a peerinfo */ + if (dict_get_str(this->options, "transport.address-family", + &addr_family)) { + addr_family = NULL; + } + ret = glusterd_transport_inet_options_build (&options, peerinfo->hostname, - peerinfo->port); + peerinfo->port, + addr_family); if (ret) goto out; @@ -5157,11 +5165,16 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata, this = THIS; conf = this->private; - if (RPC_CLNT_DESTROY == event) { + switch (event) { + case RPC_CLNT_DESTROY: GF_FREE (peerctx->errstr); GF_FREE (peerctx->peername); GF_FREE (peerctx); return 0; + case RPC_CLNT_PING: + return 0; + default: + break; } rcu_read_lock (); diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c index 0ea66a027bf..4fdff3402f5 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handshake.c +++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c @@ -241,6 +241,50 @@ build_volfile_path (char *volume_id, char *path, } + 
volid_ptr = strstr (volume_id, "gfproxy-client/"); + if (volid_ptr) { + volid_ptr = strchr (volid_ptr, '/'); + if (!volid_ptr) { + ret = -1; + goto out; + } + volid_ptr++; + + ret = glusterd_volinfo_find (volid_ptr, &volinfo); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't find volinfo"); + goto out; + } + + glusterd_get_gfproxy_client_volfile (volinfo, path, path_len); + + ret = 0; + goto out; + } + + volid_ptr = strstr (volume_id, "gfproxy/"); + if (volid_ptr) { + volid_ptr = strchr (volid_ptr, '/'); + if (!volid_ptr) { + ret = -1; + goto out; + } + volid_ptr++; + + ret = glusterd_volinfo_find (volid_ptr, &volinfo); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Couldn't find volinfo"); + goto out; + } + + glusterd_get_gfproxyd_volfile (volinfo, path, path_len); + + ret = 0; + goto out; + } + volid_ptr = strstr (volume_id, "/snaps/"); if (volid_ptr) { ret = get_snap_volname_and_volinfo (volid_ptr, &volname, diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index c7100cab70b..e303937579e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -1791,6 +1791,7 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, int port = 0; int rdma_port = 0; char *bind_address = NULL; + char *addr_family = NULL; char socketpath[PATH_MAX] = {0}; char glusterd_uuid[1024] = {0,}; char valgrind_logfile[PATH_MAX] = {0}; @@ -1913,6 +1914,13 @@ retry: bind_address); } + if (dict_get_str (this->options, "transport.address-family", + &addr_family) == 0) { + runner_add_arg (&runner, "--xlator-option"); + runner_argprintf (&runner, "*.transport.address-family=%s", + addr_family); + } + if (volinfo->transport_type == GF_TRANSPORT_RDMA) runner_argprintf (&runner, "--volfile-server-transport=rdma"); else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA) @@ -10791,6 +10799,45 @@ out: } void +glusterd_get_gfproxy_client_volfile 
(glusterd_volinfo_t *volinfo, + char *path, int path_len) +{ + char workdir[PATH_MAX] = {0, }; + glusterd_conf_t *priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + + switch (volinfo->transport_type) { + case GF_TRANSPORT_TCP: + snprintf (path, path_len, + "%s/trusted-%s.tcp-gfproxy-fuse.vol", + workdir, volinfo->volname); + break; + + case GF_TRANSPORT_RDMA: + snprintf (path, path_len, + "%s/trusted-%s.rdma-gfproxy-fuse.vol", + workdir, volinfo->volname); + break; + default: + break; + } +} + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len) +{ + char workdir[PATH_MAX] = {0, }; + glusterd_conf_t *priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + + snprintf (path, path_len, "%s/%s.gfproxyd.vol", workdir, + volinfo->volname); +} + +void glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo, char *path, int path_len) { diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index f4c4138829f..7445407c010 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -642,6 +642,14 @@ void glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo, char *path, int path_len); +void +glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len); + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, + char *path, int path_len); + int32_t glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo, glusterd_brickinfo_t *dup_brickinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 25fb23f72b2..1f087b43ab4 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -58,6 +58,20 @@ extern struct volopt_map_entry glusterd_volopt_map[]; } \ } while (0 /* CONSTCOND */) +/** + * Needed for GFProxy + */ +#define GF_PROXY_DAEMON_PORT 
40000 +#define GF_PROXY_DAEMON_PORT_STR "40000" + +static int +volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *set_dict, void *param); + +static int +build_client_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, + dict_t *mod_dict); + /********************************************* * * xlator generation / graph manipulation API @@ -1448,6 +1462,75 @@ server_spec_extended_option_handler (volgen_graph_t *graph, static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo); static int +gfproxy_server_graph_builder (volgen_graph_t *graph, + glusterd_volinfo_t *volinfo, + dict_t *set_dict, void *param) +{ + xlator_t *xl = NULL; + char *value = NULL; + char transt[16] = {0, }; + char key[1024] = {0, }; + char port_str[7] = {0, }; + int ret = 0; + char *username = NULL; + char *password = NULL; + int rclusters = 0; + + /* We are a trusted client */ + ret = dict_set_uint32 (set_dict, "trusted-client", GF_CLIENT_TRUSTED); + if (ret != 0) + goto out; + + ret = dict_set_str (set_dict, "gfproxy-server", "on"); + if (ret != 0) + goto out; + + /* Build the client section of the graph first */ + build_client_graph (graph, volinfo, set_dict); + + /* Clear this setting so that future users of set_dict do not end up + * thinking they are a gfproxy server */ + dict_del (set_dict, "gfproxy-server"); + dict_del (set_dict, "trusted-client"); + + /* Then add the server to it */ + get_vol_transport_type (volinfo, transt); + xl = volgen_graph_add (graph, "protocol/server", volinfo->volname); + if (!xl) + goto out; + + ret = xlator_set_option (xl, "listen-port", GF_PROXY_DAEMON_PORT_STR); + if (ret != 0) + goto out; + + ret = xlator_set_option (xl, "transport-type", transt); + if (ret != 0) + goto out; + + /* Set username and password */ + username = glusterd_auth_get_username (volinfo); + password = glusterd_auth_get_password (volinfo); + if (username) { + snprintf (key, sizeof (key), "auth.login.%s-server.allow", + 
volinfo->volname); + ret = xlator_set_option (xl, key, username); + if (ret) + return -1; + } + + if (password) { + snprintf (key, sizeof (key), "auth.login.%s.password", + username); + ret = xlator_set_option (xl, key, password); + if (ret != 0) + goto out; + } + +out: + return ret; +} + +static int brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, dict_t *set_dict, glusterd_brickinfo_t *brickinfo) { @@ -2541,6 +2624,48 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, } static int +gfproxy_server_perfxl_option_handler (volgen_graph_t *graph, + struct volopt_map_entry *vme, + void *param) +{ + gf_boolean_t enabled = _gf_false; + glusterd_volinfo_t *volinfo = NULL; + + GF_ASSERT (param); + volinfo = param; + + /* write-behind is the *not* allowed for gfproxy-servers */ + if (strstr (vme->key, "write-behind")) { + return 0; + } + + perfxl_option_handler (graph, vme, param); + + return 0; +} + +static int +gfproxy_client_perfxl_option_handler (volgen_graph_t *graph, + struct volopt_map_entry *vme, + void *param) +{ + gf_boolean_t enabled = _gf_false; + glusterd_volinfo_t *volinfo = NULL; + + GF_ASSERT (param); + volinfo = param; + + /* write-behind is the only allowed "perf" for gfproxy-clients */ + if (!strstr (vme->key, "write-behind")) + return 0; + + perfxl_option_handler (graph, vme, param); + + return 0; +} + + +static int nfsperfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, void *param) { @@ -2768,8 +2893,10 @@ _free_xlator_opt_key (char *key) } static xlator_t * -volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, - char *hostname, char *subvol, char *xl_id, +volgen_graph_build_client (volgen_graph_t *graph, + glusterd_volinfo_t *volinfo, + char *hostname, char *port, + char *subvol, char *xl_id, char *transt, dict_t *set_dict) { xlator_t *xl = NULL; @@ -2801,6 +2928,12 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, 
goto err; } + if (port) { + ret = xlator_set_option (xl, "remote-port", port); + if (ret) + goto err; + } + ret = xlator_set_option (xl, "remote-subvolume", subvol); if (ret) goto err; @@ -2824,7 +2957,8 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, ret = dict_get_uint32 (set_dict, "trusted-client", &client_type); - if (!ret && client_type == GF_CLIENT_TRUSTED) { + if (!ret && (client_type == GF_CLIENT_TRUSTED + || client_type == GF_CLIENT_TRUSTED_PROXY)) { str = NULL; str = glusterd_auth_get_username (volinfo); if (str) { @@ -2911,7 +3045,9 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, i = 0; cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) { xl = volgen_graph_build_client (graph, volinfo, - brick->hostname, brick->path, + brick->hostname, + NULL, + brick->path, brick->brick_id, transt, set_dict); if (!xl) { @@ -3143,8 +3279,9 @@ volgen_graph_build_snapview_client (volgen_graph_t *graph, get_transport_type (volinfo, set_dict, transt, _gf_false); - prot_clnt = volgen_graph_build_client (graph, volinfo, NULL, subvol, - xl_id, transt, set_dict); + prot_clnt = volgen_graph_build_client (graph, volinfo, + NULL, NULL, subvol, + xl_id, transt, set_dict); if (!prot_clnt) { ret = -1; goto out; @@ -3555,6 +3692,27 @@ static int client_graph_set_perf_options(volgen_graph_t *graph, { data_t *tmp_data = NULL; char *volname = NULL; + int ret = 0; + + /* + * Logic to make sure gfproxy-client gets custom performance translators + */ + ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); + if (ret == 1) { + return volgen_graph_set_options_generic ( + graph, set_dict, volinfo, + &gfproxy_client_perfxl_option_handler); + } + + /* + * Logic to make sure gfproxy-server gets custom performance translators + */ + ret = dict_get_str_boolean (set_dict, "gfproxy-server", 0); + if (ret == 1) { + return volgen_graph_set_options_generic ( + graph, set_dict, volinfo, + 
&gfproxy_server_perfxl_option_handler); + } /* * Logic to make sure NFS doesn't have performance translators by @@ -3768,29 +3926,55 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, char *volname = NULL; glusterd_conf_t *conf = THIS->private; char *tmp = NULL; + char *hostname = NULL; gf_boolean_t var = _gf_false; gf_boolean_t ob = _gf_false; + gf_boolean_t is_gfproxy = _gf_false; int uss_enabled = -1; xlator_t *this = THIS; + char *subvol = NULL; + size_t subvol_namelen = 0; GF_ASSERT (this); GF_ASSERT (conf); - volname = volinfo->volname; - ret = volgen_graph_build_clients (graph, volinfo, set_dict, - param); - if (ret) + ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); + if (ret == -1) goto out; - if (volinfo->type == GF_CLUSTER_TYPE_TIER) - ret = volume_volgen_graph_build_clusters_tier - (graph, volinfo, _gf_false); - else - ret = volume_volgen_graph_build_clusters - (graph, volinfo, _gf_false); + volname = volinfo->volname; + if (ret == 0) { + ret = volgen_graph_build_clients (graph, volinfo, set_dict, + param); + if (ret) + goto out; - if (ret == -1) - goto out; + if (volinfo->type == GF_CLUSTER_TYPE_TIER) + ret = volume_volgen_graph_build_clusters_tier + (graph, volinfo, _gf_false); + else + ret = volume_volgen_graph_build_clusters + (graph, volinfo, _gf_false); + + if (ret == -1) + goto out; + } else { + is_gfproxy = _gf_true; + ret = dict_get_str (set_dict, + "config.gfproxyd-remote-host", &tmp); + if (ret == -1) + goto out; + + subvol_namelen = strlen (volinfo->volname) + + strlen ("-server") + 1; + subvol = alloca (subvol_namelen); + snprintf (subvol, subvol_namelen, + "%s-server", volinfo->volname); + + volgen_graph_build_client (graph, volinfo, tmp, + GF_PROXY_DAEMON_PORT_STR, subvol, + "gfproxy", "tcp", set_dict); + } ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false); if (ret == -1) @@ -3851,6 +4035,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } } + /* gfproxy 
needs the AHA translator */ + if (is_gfproxy) { + xl = volgen_graph_add (graph, "cluster/aha", volname); + if (!xl) { + ret = -1; + goto out; + } + } + if (conf->op_version == GD_OP_VERSION_MIN) { ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA); @@ -4300,6 +4493,19 @@ nfs_option_handler (volgen_graph_t *graph, return -1; } + if (! strcmp (vme->option, "!nfs.*.exports-auth-enable")) { + ret = gf_asprintf (&aa, "nfs.%s.exports-auth-enable", + volinfo->volname); + + if (ret != -1) { + ret = xlator_set_option (xl, aa, vme->value); + GF_FREE (aa); + } + + if (ret) + return -1; + } + if ((strcmp (vme->voltype, "nfs/server") == 0) && (vme->option && vme->option[0]!='!') ) { ret = xlator_set_option (xl, vme->option, vme->value); @@ -4348,8 +4554,12 @@ volgen_get_shd_key (int type) static gf_boolean_t volgen_is_shd_compatible_xl (char *xl_type) { - char *shd_xls[] = {"cluster/replicate", "cluster/disperse", - NULL}; + char *shd_xls[] = { + "cluster/replicate", + "cluster/disperse", + "debug/io-stats", + NULL + }; if (gf_get_index_by_elem (shd_xls, xl_type) != -1) return _gf_true; @@ -4731,6 +4941,24 @@ out: return ret; } +static int +volgen_graph_set_iam_nfsd (const volgen_graph_t *graph) +{ + xlator_t *trav; + int ret = 0; + + for (trav = first_of ((volgen_graph_t *)graph); trav; + trav = trav->next) { + if (strcmp (trav->type, "cluster/replicate") != 0) + continue; + + ret = xlator_set_option (trav, "iam-nfs-daemon", "yes"); + if (ret) + break; + } + return ret; +} + /* builds a graph for nfs server role, with option overrides in mod_dict */ int build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) @@ -4869,6 +5097,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) if (ret) goto out; + ret = volgen_graph_set_iam_nfsd (&cgraph); + if (ret) + goto out; + ret = volgen_graph_merge_sub (graph, &cgraph, 1); if (ret) goto out; @@ -4930,6 +5162,22 @@ get_brick_filepath (char *filename, glusterd_volinfo_t *volinfo, brickinfo->hostname, brick); } 
+static void +get_gfproxyd_filepath (char *filename, glusterd_volinfo_t *volinfo) +{ + char path[PATH_MAX] = {0, }; + char brick[PATH_MAX] = {0, }; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + + GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv); + + snprintf (filename, PATH_MAX, + "%s/%s.gfproxyd.vol", path, + volinfo->volname); +} + gf_boolean_t glusterd_is_valid_volfpath (char *volname, char *brick) { @@ -4975,6 +5223,32 @@ out: } static int +glusterd_generate_gfproxyd_volfile (glusterd_volinfo_t *volinfo) +{ + volgen_graph_t graph = {0, }; + char filename[PATH_MAX] = {0, }; + int ret = -1; + + GF_ASSERT (volinfo); + + get_gfproxyd_filepath (filename, volinfo); + + struct glusterd_gfproxyd_info info = { + .port = GF_PROXY_DAEMON_PORT, + }; + + ret = build_graph_generic (&graph, volinfo, + NULL, &info, + &gfproxy_server_graph_builder); + if (ret == 0) + ret = volgen_write_volfile (&graph, filename); + + volgen_graph_free (&graph); + + return ret; +} + +static int glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo, dict_t *mod_dict, void *data) @@ -5245,7 +5519,8 @@ glusterd_generate_client_per_brick_volfile (glusterd_volinfo_t *volinfo) cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) { xl = volgen_graph_build_client (&graph, volinfo, - brick->hostname, brick->path, + brick->hostname, + NULL, brick->path, brick->brick_id, "tcp", dict); if (!xl) { @@ -5376,6 +5651,11 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo, ret = glusterd_get_trusted_client_filepath (filepath, volinfo, type); + } else if (client_type == GF_CLIENT_TRUSTED_PROXY) { + glusterd_get_gfproxy_client_volfile (volinfo, + filepath, + PATH_MAX); + ret = dict_set_str (dict, "gfproxy-client", "on"); } else { ret = glusterd_get_client_filepath (filepath, volinfo, @@ -5620,6 +5900,7 @@ build_bitd_volume_graph (volgen_graph_t *graph, xl = volgen_graph_build_client (&cgraph, volinfo, brickinfo->hostname, + NULL, brickinfo->path, 
brickinfo->brick_id, transt, set_dict); @@ -5782,6 +6063,7 @@ build_scrub_volume_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, xl = volgen_graph_build_client (&cgraph, volinfo, brickinfo->hostname, + NULL, brickinfo->path, brickinfo->brick_id, transt, set_dict); @@ -5913,12 +6195,25 @@ glusterd_create_volfiles (glusterd_volinfo_t *volinfo) goto out; } + ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED_PROXY); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not generate gfproxy client volfiles"); + goto out; + } + ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER); if (ret) gf_msg (this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL, "Could not generate client volfiles"); + + ret = glusterd_generate_gfproxyd_volfile (volinfo); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Could not generate gfproxy volfiles"); + out: return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index f90177372dc..cb2cad50efc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -52,7 +52,8 @@ typedef enum { GF_CLIENT_TRUSTED, - GF_CLIENT_OTHER + GF_CLIENT_OTHER, + GF_CLIENT_TRUSTED_PROXY, } glusterd_client_type_t; struct volgen_graph { diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 62de6b31b64..8f2a23a898a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -286,6 +286,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) int32_t type = 0; char *username = NULL; char *password = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif GF_ASSERT (req); @@ -388,10 +393,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) /* Setting default as inet for trans_type tcp */ ret = dict_set_dynstr_with_alloc (dict, 
"transport.address-family", - "inet"); + addr_family); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "failed to set transport.address-family"); + "failed to set transport.address-family " + "to %s", addr_family); goto out; } } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 1e24adabe0c..d29f32d1963 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -11,6 +11,7 @@ #include "glusterd-volgen.h" #include "glusterd-utils.h" +#if USE_GFDB static int get_tier_freq_threshold (glusterd_volinfo_t *volinfo, char *threshold_key) { int threshold = 0; @@ -473,6 +474,7 @@ out: return ret; } +#endif static int validate_cache_max_min_size (glusterd_volinfo_t *volinfo, dict_t *dict, @@ -1048,6 +1050,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.min-free-strict-mode", + .voltype = "cluster/distribute", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "cluster.min-free-inodes", .voltype = "cluster/distribute", .op_version = 1, @@ -1113,6 +1120,13 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT, }, + { .key = "cluster.du-refresh-interval-sec", + .voltype = "cluster/distribute", + .option = "du-refresh-interval-sec", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, + /* NUFA xlator options (Distribute special case) */ { .key = "cluster.nufa", .voltype = "cluster/distribute", @@ -1299,6 +1313,13 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_12, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.pgfid-self-heal", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT, + .description = "Use PGFID attribute if available to remediate " + "failed heals." 
+ }, /* stripe xlator options */ { .key = "cluster.stripe-block-size", @@ -1454,6 +1475,18 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "performance.statfs-cache", + .voltype = "performance/io-cache", + .option = "statfs-cache", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "performance.statfs-cache-timeout", + .voltype = "performance/io-cache", + .option = "statfs-cache-timeout", + .op_version = 1, + .flags = OPT_FLAG_CLIENT_OPT + }, /* IO-threads xlator options */ { .key = "performance.io-thread-count", @@ -1461,6 +1494,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "thread-count", .op_version = 1 }, + { .key = "performance.io-thread-fops-per-thread-ratio", + .voltype = "performance/io-threads", + .option = "fops-per-thread-ratio", + .op_version = 1 + }, { .key = "performance.high-prio-threads", .voltype = "performance/io-threads", .op_version = 1 @@ -1555,6 +1593,18 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "performance.write-behind-trickling-writes", + .voltype = "performance/write-behind", + .option = "trickling-writes", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "performance.nfs.write-behind-trickling-writes", + .voltype = "performance/write-behind", + .option = "trickling-writes", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = "performance.lazy-open", .voltype = "performance/open-behind", .option = "lazy-open", @@ -2403,8 +2453,8 @@ struct volopt_map_entry glusterd_volopt_map[] = { /* Cli options for Export authentication on nfs mount */ { .key = "nfs.exports-auth-enable", .voltype = "nfs/server", - .option = "nfs.exports-auth-enable", - .type = GLOBAL_DOC, + .option = "!nfs.*.exports-auth-enable", + //.type = GLOBAL_DOC, .op_version = GD_OP_VERSION_3_7_0 }, { .key = "nfs.auth-refresh-interval-sec", @@ -2500,6 +2550,14 @@ struct 
volopt_map_entry glusterd_volopt_map[] = { .voltype = "storage/posix", .op_version = GD_OP_VERSION_3_6_0, }, + { .key = "storage.min-free-disk", + .voltype = "storage/posix", + .op_version = 2, + }, + { .key = "storage.freespace-check-interval", + .voltype = "storage/posix", + .op_version = 2, + }, { .key = "storage.bd-aio", .voltype = "storage/bd", .op_version = 3 @@ -2515,6 +2573,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "!config", .op_version = 2 }, + { .key = "config.gfproxyd-remote-host", + .voltype = "configuration", + .option = "gfproxyd-remote-host", + .op_version = 2 + }, { .key = GLUSTERD_QUORUM_TYPE_KEY, .voltype = "mgmt/glusterd", .value = "off", @@ -2961,7 +3024,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { { .key = "cluster.locking-scheme", .voltype = "cluster/replicate", .type = DOC, - .op_version = GD_OP_VERSION_3_7_12, + .op_version = GD_OP_VERSION_3_7_12 , .flags = OPT_FLAG_CLIENT_OPT }, { .key = "cluster.granular-entry-heal", @@ -2970,6 +3033,72 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_8_0, .flags = OPT_FLAG_CLIENT_OPT }, + { .option = "revocation-secs", + .key = "features.locks-revocation-secs", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "revocation-clear-all", + .key = "features.locks-revocation-clear-all", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "revocation-max-blocked", + .key = "features.locks-revocation-max-blocked", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + }, + { .option = "monkey-unlocking", + .key = "features.locks-monkey-unlocking", + .voltype = "features/locks", + .op_version = GD_OP_VERSION_3_6_0, + .type = NO_DOC, + }, + { .key = "cluster.halo-enabled", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-hybrid-mode", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = 
OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-failover-enabled", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-shd-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-nfsd-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-latency", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-replicas", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-min-replicas", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-min-samples", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = NULL } }; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 5bdf2ad0d4b..7c59d5501a9 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -222,6 +222,11 @@ struct glusterd_brickinfo { typedef struct glusterd_brickinfo glusterd_brickinfo_t; +struct glusterd_gfproxyd_info { + short port; + char *logfile; +}; + struct gf_defrag_brickinfo_ { char *name; int files; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 6c4cdfed062..598f62fee7a 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -186,6 +186,25 @@ start_glusterfs () fi #options with values start here + if [ -n "$halo_failover_enabled" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-failover-enabled=$halo_failover_enabled"); + fi + if [ -n "$halo_max_latency" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + 
*replicate*.halo-max-latency=$halo_max_latency"); + fi + + if [ -n "$halo_max_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-replicas=$halo_max_replicas"); + fi + + if [ -n "$halo_min_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-min-replicas=$halo_min_replicas"); + fi + if [ -n "$log_level" ]; then cmd_line=$(echo "$cmd_line --log-level=$log_level"); fi @@ -479,6 +498,18 @@ with_options() [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts," fuse_mountopts="${fuse_mountopts}$key=\"$value\"" ;; + "halo-max-latency") + halo_max_latency=$value + ;; + "halo-max-replicas") + halo_max_replicas=$value + ;; + "halo-min-replicas") + halo_min_replicas=$value + ;; + "halo-failover-enabled") + halo_failover_enabled=$value + ;; x-*) # comments or userspace application-specific options, drop them ;; diff --git a/xlators/nfs/server/src/auth-cache.c b/xlators/nfs/server/src/auth-cache.c index 730e0a97d20..a607502c9de 100644 --- a/xlators/nfs/server/src/auth-cache.c +++ b/xlators/nfs/server/src/auth-cache.c @@ -17,47 +17,28 @@ #include "exports.h" #include "nfs-messages.h" -enum auth_cache_lookup_results { - ENTRY_FOUND = 0, - ENTRY_NOT_FOUND = -1, - ENTRY_EXPIRED = -2, -}; - -struct auth_cache_entry { - GF_REF_DECL; /* refcounting */ - data_t *data; /* data_unref() on refcount == 0 */ - - time_t timestamp; - struct export_item *item; -}; - /* Given a filehandle and an ip, creates a colon delimited hashkey. 
*/ -static char* -make_hashkey(struct nfs3_fh *fh, const char *host) -{ - char *hashkey = NULL; - char exportid[256] = {0, }; - char gfid[256] = {0, }; - char mountid[256] = {0, }; - size_t nbytes = 0; - - gf_uuid_unparse (fh->exportid, exportid); - gf_uuid_unparse (fh->gfid, gfid); - gf_uuid_unparse (fh->mountid, mountid); - - nbytes = strlen (exportid) + strlen (host) - + strlen (mountid) + 3; - hashkey = GF_MALLOC (nbytes, gf_common_mt_char); - if (!hashkey) - return NULL; - - snprintf (hashkey, nbytes, "%s:%s:%s", exportid, - mountid, host); - - return hashkey; -} - +#define make_fh_hashkey(hashkey, fh, host) \ + do { \ + char exportid[256] = {0, }; \ + char mountid[256] = {0, }; \ + size_t nbytes = 0; \ + gf_uuid_unparse (fh->exportid, exportid); \ + gf_uuid_unparse (fh->mountid, mountid); \ + nbytes = strlen (exportid) + strlen (host) \ + + strlen (mountid) + 5; \ + hashkey = alloca (nbytes); \ + snprintf (hashkey, nbytes, "%s:%s:%s", exportid, \ + mountid, host); \ + } while (0); \ + +#define make_path_hashkey(hashkey, path, host) \ + do { \ + size_t nbytes = strlen (path) + strlen (host) + 2; \ + hashkey = alloca (nbytes); \ + snprintf (hashkey, nbytes, "%s:%s", path, host); \ + } while (0); /** * auth_cache_init -- Initialize an auth cache and set the ttl_sec * @@ -86,28 +67,11 @@ out: return cache; } -/* auth_cache_entry_free -- called by refcounting subsystem on refcount == 0 - * - * @to_free: auth_cache_entry that has refcount == 0 and needs to get free'd - */ -void -auth_cache_entry_free (void *to_free) -{ - struct auth_cache_entry *entry = to_free; - data_t *entry_data = NULL; - - GF_VALIDATE_OR_GOTO (GF_NFS, entry, out); - GF_VALIDATE_OR_GOTO (GF_NFS, entry->data, out); - - entry_data = entry->data; - /* set data_t->data to NULL, otherwise data_unref() tries to free it */ - entry_data->data = NULL; - data_unref (entry_data); - - GF_FREE (entry); -out: - return; -} +struct auth_cache_entry { + time_t timestamp; + struct export_item *item; + 
gf_boolean_t access_allowed; +}; /** * auth_cache_entry_init -- Initialize an auth cache entry @@ -124,303 +88,203 @@ auth_cache_entry_init () if (!entry) gf_msg (GF_NFS, GF_LOG_WARNING, ENOMEM, NFS_MSG_NO_MEMORY, "failed to allocate entry"); - else - GF_REF_INIT (entry, auth_cache_entry_free); return entry; } +// Internal lookup +enum _internal_cache_lookup_results { + ENTRY_NOT_FOUND = -1, + ENTRY_EXPIRED = -2, +}; + /** - * auth_cache_add -- Add an auth_cache_entry to the cache->dict + * auth_cache_purge -- Purge the dict in the cache and set + * the dict pointer to NULL. It will be allocated + * on the first insert into the dict. + * + * @cache: Cache to purge * - * @return: 0 on success, non-zero otherwise. */ -static int -auth_cache_add (struct auth_cache *cache, char *hashkey, - struct auth_cache_entry *entry) +void +auth_cache_purge (struct auth_cache *cache) { - int ret = -1; - data_t *entry_data = NULL; - - GF_VALIDATE_OR_GOTO (GF_NFS, cache, out); - GF_VALIDATE_OR_GOTO (GF_NFS, cache->cache_dict, out); - - ret = GF_REF_GET (entry); - if (ret == 0) { - /* entry does not have any references */ - ret = -1; - goto out; - } + dict_t *new_cache_dict = NULL; + dict_t *old_cache_dict = cache->cache_dict; - entry_data = bin_to_data (entry, sizeof (*entry)); - if (!entry_data) { - ret = -1; - GF_REF_PUT (entry); + if (!cache || !cache->cache_dict) goto out; - } - /* we'll take an extra ref on the data_t, it gets unref'd when the - * auth_cache_entry is released */ - entry->data = data_ref (entry_data); + (void)__sync_lock_test_and_set (&cache->cache_dict, new_cache_dict); - LOCK (&cache->lock); - { - ret = dict_set (cache->cache_dict, hashkey, entry_data); - } - UNLOCK (&cache->lock); - - if (ret) { - /* adding to dict failed */ - GF_REF_PUT (entry); - } + dict_destroy (old_cache_dict); out: - return ret; + return; } -/** - * _auth_cache_expired -- Check if the auth_cache_entry has expired - * - * The auth_cache->lock should have been taken when this function is 
called. - * - * @return: true when the auth_cache_entry is expired, false otherwise. - */ -static int -_auth_cache_expired (struct auth_cache *cache, struct auth_cache_entry *entry) -{ - return ((time (NULL) - entry->timestamp) > cache->ttl_sec); -} /** - * auth_cache_get -- Get the @hashkey entry from the cache->cache_dict - * - * @cache: The auth_cache that should contain the @entry. - * @haskkey: The key associated with the auth_cache_entry. - * @entry: The found auth_cache_entry, unmodified if not found/expired. - * - * The using the cache->dict requires locking, this function takes care of - * that. When the entry is found, but has expired, it will be removed from the - * cache_dict. - * - * @return: 0 when found, ENTRY_NOT_FOUND or ENTRY_EXPIRED otherwise. + * Lookup filehandle or path from the cache. */ -static enum auth_cache_lookup_results -auth_cache_get (struct auth_cache *cache, char *hashkey, - struct auth_cache_entry **entry) +int _cache_lookup (struct auth_cache *cache, char *key, + struct auth_cache_entry **entry) { - enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND; - data_t *entry_data = NULL; - struct auth_cache_entry *lookup_res = NULL; + int ret = ENTRY_NOT_FOUND; + struct auth_cache_entry *lookup_res; + data_t *entry_data; - GF_VALIDATE_OR_GOTO (GF_NFS, cache, out); - GF_VALIDATE_OR_GOTO (GF_NFS, cache->cache_dict, out); - GF_VALIDATE_OR_GOTO (GF_NFS, hashkey, out); - - LOCK (&cache->lock); - { - entry_data = dict_get (cache->cache_dict, hashkey); - if (!entry_data) - goto unlock; - - lookup_res = (struct auth_cache_entry *)(entry_data->data); - if (GF_REF_GET (lookup_res) == 0) { - /* entry has been free'd */ - ret = ENTRY_EXPIRED; - goto unlock; - } + if (!cache->cache_dict) { + goto out; + } - if (_auth_cache_expired (cache, lookup_res)) { - ret = ENTRY_EXPIRED; + if (!entry) { + goto out; + } - /* free entry and remove from the cache */ - GF_FREE (lookup_res); - entry_data->data = NULL; - dict_del (cache->cache_dict, hashkey); + *entry 
= NULL; - goto unlock; - } + entry_data = dict_get (cache->cache_dict, key); + if (!entry_data) { + goto out; + } - *entry = lookup_res; - ret = ENTRY_FOUND; + lookup_res = (struct auth_cache_entry *)(entry_data->data); + if (time (NULL) - lookup_res->timestamp > cache->ttl_sec) { + GF_FREE (lookup_res); + entry_data->data = NULL; + dict_del (cache->cache_dict, key); // Remove from the cache + ret = ENTRY_EXPIRED; + goto out; } -unlock: - UNLOCK (&cache->lock); + + *entry = lookup_res; + + return 0; out: - return ret; + return -1; } /** - * auth_cache_lookup -- Lookup an item from the cache - * - * @cache: cache to lookup from - * @fh : FH to use in lookup - * @host_addr: Address to use in lookup - * @timestamp: The timestamp to set when lookup succeeds - * @can_write: Is the host authorized to write to the filehandle? - * - * If the current time - entry time of the cache entry > ttl_sec, - * we remove the element from the dict and return ENTRY_EXPIRED. - * - * @return: ENTRY_EXPIRED if entry expired - * ENTRY_NOT_FOUND if entry not found in dict - * 0 if found + * Lookup filehandle from the cache. 
*/ -enum auth_cache_lookup_results -auth_cache_lookup (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr, time_t *timestamp, - gf_boolean_t *can_write) +int +_cache_lookup_fh (struct auth_cache *cache, struct nfs3_fh *fh, + const char *host_addr, struct auth_cache_entry **ec) { - char *hashkey = NULL; - struct auth_cache_entry *lookup_res = NULL; - enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND; - - GF_VALIDATE_OR_GOTO (GF_NFS, cache, out); - GF_VALIDATE_OR_GOTO (GF_NFS, fh, out); - GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out); - GF_VALIDATE_OR_GOTO (GF_NFS, timestamp, out); - GF_VALIDATE_OR_GOTO (GF_NFS, can_write, out); - - hashkey = make_hashkey (fh, host_addr); - if (!hashkey) { - ret = -ENOMEM; - goto out; + char *hashkey; + int ret = ENTRY_NOT_FOUND; + if (fh && host_addr) { + make_fh_hashkey (hashkey, fh, host_addr); + ret =_cache_lookup (cache, hashkey, ec); } - - ret = auth_cache_get (cache, hashkey, &lookup_res); - switch (ret) { - case ENTRY_FOUND: - *timestamp = lookup_res->timestamp; - *can_write = lookup_res->item->opts->rw; - GF_REF_PUT (lookup_res); - break; - - case ENTRY_NOT_FOUND: - gf_msg_debug (GF_NFS, 0, "could not find entry for %s", - host_addr); - break; - - case ENTRY_EXPIRED: - gf_msg_debug (GF_NFS, 0, "entry for host %s has expired", - host_addr); - break; - } - -out: - GF_FREE (hashkey); - return ret; } -/* auth_cache_entry_purge -- free up the auth_cache_entry - * - * This gets called through dict_foreach() by auth_cache_purge(). Each - * auth_cache_entry has a refcount which needs to be decremented. Once the - * auth_cache_entry reaches refcount == 0, auth_cache_entry_free() will call - * data_unref() to free the associated data_t. - * - * @d: dict that gets purged by auth_cache_purge() - * @k: hashkey of the current entry - * @v: data_t of the current entry +/** + * Lookup path from the cache. 
*/ int -auth_cache_entry_purge (dict_t *d, char *k, data_t *v, void *_unused) +_cache_lookup_path (struct auth_cache *cache, const char *path, + const char *host_addr, struct auth_cache_entry **ec) { - struct auth_cache_entry *entry = (struct auth_cache_entry *) v->data; - - if (entry) - GF_REF_PUT (entry); - - return 0; + char *hashkey; + int ret = ENTRY_NOT_FOUND; + if (path && host_addr) { + make_path_hashkey (hashkey, path, host_addr); + ret = _cache_lookup (cache, hashkey, ec); + } + return ret; } /** - * auth_cache_purge -- Purge the dict in the cache and create a new empty one. - * - * @cache: Cache to purge - * + * cache_item -- Caches either a filehandle or path. + * See descriptions of functions that invoke this one. */ -void -auth_cache_purge (struct auth_cache *cache) +int +cache_item (struct auth_cache *cache, const char *path, struct nfs3_fh *fh, + const char *host_addr, struct export_item *export_item, + auth_cache_status_t status) { - dict_t *new_cache_dict = dict_new (); - dict_t *old_cache_dict = NULL; + int ret = -EINVAL; + data_t *entry_data = NULL; + struct auth_cache_entry *entry = NULL; + char *hashkey = NULL; - if (!cache || !new_cache_dict) + GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out); + GF_VALIDATE_OR_GOTO (GF_NFS, cache, out); + + // We can cache either a file-handle or a path, not both, + // and at least one of them must be defined! + if ((fh && path) || (!fh && !path)) { goto out; + } - LOCK (&cache->lock); - { - old_cache_dict = cache->cache_dict; - cache->cache_dict = new_cache_dict; + // If a dict has not been allocated already, allocate it. 
+ if (!cache->cache_dict) { + cache->cache_dict = dict_new (); + if (!cache->cache_dict) { + ret = -ENOMEM; + goto out; + } } - UNLOCK (&cache->lock); - /* walk all entries and refcount-- with GF_REF_PUT() */ - dict_foreach (old_cache_dict, auth_cache_entry_purge, NULL); - dict_unref (old_cache_dict); -out: - return; -} -/** - * is_nfs_fh_cached_and_writeable -- Checks if an NFS FH is cached for the given - * host - * @cache: The fh cache - * @host_addr: Address to use in lookup - * @fh: The fh to use in lookup - * - * - * @return: TRUE if cached, FALSE otherwise - * - */ -gf_boolean_t -is_nfs_fh_cached (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr) -{ - int ret = 0; - time_t timestamp = 0; - gf_boolean_t cached = _gf_false; - gf_boolean_t can_write = _gf_false; + // Find an entry with the filehandle or path, depending + // on which one is defined. Validation for these parameters + // is above. + if (fh) { + ret = _cache_lookup_fh (cache, fh, host_addr, &entry); + make_fh_hashkey (hashkey, fh, host_addr) + } - if (!fh) - goto out; + if (path) { + ret = _cache_lookup_path (cache, path, host_addr, &entry); + make_path_hashkey (hashkey, path, host_addr) + } + + // If no entry was found, we need to create one. + if (!entry) { + entry = auth_cache_entry_init (); + GF_CHECK_ALLOC (entry, ret, out); + } - ret = auth_cache_lookup (cache, fh, host_addr, ×tamp, &can_write); - cached = (ret == ENTRY_FOUND); + // Populate the entry + entry->timestamp = time (NULL); + entry->item = export_item; + // Access is only allowed if the status is set to + // AUTH_CACHE_HOST_AUTH_OK + entry->access_allowed = (status == AUTH_CACHE_HOST_AUTH_OK); + // Put the entry into the cache + entry_data = bin_to_data (entry, sizeof (*entry)); + dict_set (cache->cache_dict, hashkey, entry_data); + gf_log (GF_NFS, GF_LOG_TRACE, "Caching %s for host(%s) as %s", + path ? path : "fh", host_addr, entry->access_allowed ? 
+ "ALLOWED" : "NOT ALLOWED"); out: - return cached; + return ret; } - /** - * is_nfs_fh_cached_and_writeable -- Checks if an NFS FH is cached for the given - * host and writable - * @cache: The fh cache - * @host_addr: Address to use in lookup - * @fh: The fh to use in lookup - * + * cache_nfs_path -- Places the path in the underlying dict as we are + * using as our cache. The value is an entry struct + * containing the export item that was authorized or + * deauthorized for the operation and the path authorized + * or deauthorized. * - * @return: TRUE if cached & writable, FALSE otherwise + * @cache: The cache to place fh's in + * @path : The path to cache + * @host_addr: The address of the host + * @export_item: The export item that was authorized/deauthorized * */ -gf_boolean_t -is_nfs_fh_cached_and_writeable (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr) +int +cache_nfs_path (struct auth_cache *cache, const char *path, + const char *host_addr, struct export_item *export_item, + auth_cache_status_t status) { - int ret = 0; - time_t timestamp = 0; - gf_boolean_t cached = _gf_false; - gf_boolean_t writable = _gf_false; - - if (!fh) - goto out; - - ret = auth_cache_lookup (cache, fh, host_addr, ×tamp, &writable); - cached = ((ret == ENTRY_FOUND) && writable); - -out: - return cached; + return cache_item (cache, path, NULL, host_addr, export_item, status); } /** @@ -438,52 +302,68 @@ out: */ int cache_nfs_fh (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr, struct export_item *export_item) + const char *host_addr, struct export_item *export_item, + auth_cache_status_t status) { - int ret = -EINVAL; - char *hashkey = NULL; - data_t *entry_data = NULL; - time_t timestamp = 0; - gf_boolean_t can_write = _gf_false; - struct auth_cache_entry *entry = NULL; + return cache_item (cache, NULL, fh, host_addr, export_item, status); +} - GF_VALIDATE_OR_GOTO (GF_NFS, host_addr, out); - GF_VALIDATE_OR_GOTO (GF_NFS, cache, out); - 
GF_VALIDATE_OR_GOTO (GF_NFS, fh, out); +auth_cache_status_t +auth_cache_allows (struct auth_cache *cache, struct nfs3_fh *fh, + const char *path, const char *host_addr, + gf_boolean_t check_rw_access) +{ + int ret = 0; + int status = AUTH_CACHE_HOST_EACCES; + gf_boolean_t cache_allows = FALSE; + struct auth_cache_entry *ace = NULL; - /* If we could already find it in the cache, just return */ - ret = auth_cache_lookup (cache, fh, host_addr, ×tamp, &can_write); - if (ret == 0) { - gf_msg_trace (GF_NFS, 0, "found cached auth/fh for host " - "%s", host_addr); + if ((fh && path) || (!fh && !path)) { + status = AUTH_CACHE_HOST_ENOENT; goto out; } - hashkey = make_hashkey (fh, host_addr); - if (!hashkey) { - ret = -ENOMEM; - goto out; + if (fh) { + ret = _cache_lookup_fh (cache, fh, host_addr, &ace); } - entry = auth_cache_entry_init (); - if (!entry) { - ret = -ENOMEM; - goto out; + if (path) { + ret = _cache_lookup_path (cache, path, host_addr, &ace); } - entry->timestamp = time (NULL); - entry->item = export_item; - - ret = auth_cache_add (cache, hashkey, entry); - GF_REF_PUT (entry); - if (ret) - goto out; + cache_allows = (ret == 0) && ace->access_allowed; + if (check_rw_access) { + cache_allows = cache_allows && ace->item->opts->rw; + } - gf_msg_trace (GF_NFS, 0, "Caching file-handle (%s)", host_addr); - ret = 0; + if (!ace) { + status = AUTH_CACHE_HOST_ENOENT; + } + if (cache_allows) { + status = AUTH_CACHE_HOST_AUTH_OK; + } out: - GF_FREE (hashkey); + return status; +} - return ret; +auth_cache_status_t +auth_cache_allows_fh (struct auth_cache *cache, struct nfs3_fh *fh, + const char *host_addr) +{ + return auth_cache_allows (cache, fh, NULL, host_addr, FALSE); +} + +auth_cache_status_t +auth_cache_allows_write_to_fh (struct auth_cache *cache, struct nfs3_fh *fh, + const char *host_addr) +{ + return auth_cache_allows (cache, fh, NULL, host_addr, TRUE); +} + +auth_cache_status_t +auth_cache_allows_path (struct auth_cache *cache, const char *path, + const char 
*host_addr) +{ + return auth_cache_allows (cache, NULL, path, host_addr, FALSE); } diff --git a/xlators/nfs/server/src/auth-cache.h b/xlators/nfs/server/src/auth-cache.h index a3ea5a43ded..de7db6b5545 100644 --- a/xlators/nfs/server/src/auth-cache.h +++ b/xlators/nfs/server/src/auth-cache.h @@ -27,6 +27,11 @@ struct auth_cache { time_t ttl_sec; /* TTL of the auth cache in seconds */ }; +typedef enum { + AUTH_CACHE_HOST_ENOENT = -1, /* Host not found in cache */ + AUTH_CACHE_HOST_EACCES = -2, /* Host explicitly de-authed */ + AUTH_CACHE_HOST_AUTH_OK = 0, /* Host is fully authed */ +} auth_cache_status_t; /* Initializes the cache */ struct auth_cache * @@ -35,17 +40,29 @@ auth_cache_init (time_t ttl_sec); /* Inserts FH into cache */ int cache_nfs_fh (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr, struct export_item *export_item); + const char *host_addr, struct export_item *export_item, + auth_cache_status_t status); + +/* Inserts path into cache */ +int +cache_nfs_path (struct auth_cache *cache, const char *path, + const char *host_addr, struct export_item *export_item, + auth_cache_status_t status); /* Checks if the filehandle cached & writable */ -gf_boolean_t -is_nfs_fh_cached_and_writeable (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr); +auth_cache_status_t +auth_cache_allows_write_to_fh (struct auth_cache *cache, struct nfs3_fh *fh, + const char *host_addr); /* Checks if the filehandle is cached */ -gf_boolean_t -is_nfs_fh_cached (struct auth_cache *cache, struct nfs3_fh *fh, - const char *host_addr); +auth_cache_status_t +auth_cache_allows_fh (struct auth_cache *cache, struct nfs3_fh *fh, + const char *host_addr); + +/* Checks if the path is cached */ +auth_cache_status_t +auth_cache_allows_path (struct auth_cache *cache, const char *path, + const char *host_addr); /* Purge the cache */ void diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h index bc9af2f0b8b..a4e15d3f7ef 100644 --- 
a/xlators/nfs/server/src/exports.h +++ b/xlators/nfs/server/src/exports.h @@ -22,7 +22,7 @@ #define GF_EXP GF_NFS"-exports" #define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())" -#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)" +#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)" #define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)" #define NETGROUP_MAX_LEN 128 @@ -51,23 +51,28 @@ struct export_options { char *anon_uid; /* anonuid option */ char *sec_type; /* X, for sec=X */ }; +typedef struct export_options export_options_t; + struct export_item { - char *name; /* Name of the export item */ - struct export_options *opts; /* NFS Options */ + char *name; /* Name of the export item */ + export_options_t *opts; /* NFS Options */ }; +typedef struct export_item export_item_t; struct export_dir { char *dir_name; /* Directory */ dict_t *netgroups; /* Dict of netgroups */ dict_t *hosts; /* Dict of hosts */ }; +typedef struct export_dir export_dir_t; struct exports_file { char *filename; /* Filename */ dict_t *exports_dict; /* Dict of export_dir_t */ dict_t *exports_map; /* Map of SuperFastHash(<export>) -> expdir */ }; +typedef struct exports_file exports_file_t; void exp_file_deinit (struct exports_file *expfile); diff --git a/xlators/nfs/server/src/mount3-auth.c b/xlators/nfs/server/src/mount3-auth.c index 97c95cbfd23..831d92edbef 100644 --- a/xlators/nfs/server/src/mount3-auth.c +++ b/xlators/nfs/server/src/mount3-auth.c @@ -429,6 +429,15 @@ __export_dir_lookup_netgroup (dict_t *dict, char *key, data_t *val, GF_ASSERT ((*key == '@')); + /** + * If at any point in time as we search through the dictionaries, + * if we were marked as "Found", we should exit out immediately + * and not set anything else in this struct. 
+ */ + if (ngsa->found) { + goto out; + } + /* We use ++key here because keys start with '@' for ngs */ ngentry = ng_file_get_netgroup (nfile, (key + 1)); if (!ngentry) { @@ -452,10 +461,6 @@ __export_dir_lookup_netgroup (dict_t *dict, char *key, data_t *val, ngsa); } - /* If the above search was successful, just return */ - if (ngsa->found) - goto out; - /* Run through the netgroups dict */ if (ngentry->netgroup_ngs) { ngsa->_is_host_dict = _gf_false; diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c index b7350385c32..1cc0b07a9a6 100644 --- a/xlators/nfs/server/src/mount3.c +++ b/xlators/nfs/server/src/mount3.c @@ -24,6 +24,7 @@ #include "iatt.h" #include "nfs-mem-types.h" #include "nfs.h" +#include "nfs3.h" #include "common-utils.h" #include "store.h" #include "glfs-internal.h" @@ -36,6 +37,7 @@ #include <sys/socket.h> #include <sys/uio.h> +#define SUPPORT_RMTAB 0 /* This macro will assist in freeing up entire link list * of host_auth_spec structure. @@ -444,7 +446,7 @@ mount_open_rmtab (const char *rmtab, gf_store_handle_t **sh) return _gf_true; } - +#if SUPPORT_RMTAB /* Read the rmtab into a clean ms->mountlist. */ static void @@ -472,6 +474,7 @@ mount_read_rmtab (struct mount3_state *ms) out: gf_store_handle_destroy (sh); } +#endif /* Write the ms->mountlist to the rmtab. 
* @@ -597,7 +600,9 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req, nfs = (struct nfs_state *)ms->nfsx->private; +#if SUPPORT_RMTAB update_rmtab = mount_open_rmtab (nfs->rmtab, &sh); +#endif strncpy (me->exname, expname, MNTPATHLEN); /* Sometimes we don't care about the full path @@ -696,6 +701,9 @@ __mnt3_build_mountid_from_path (const char *path, uuid_t mountid) uint32_t hashed_path = 0; int ret = -1; + if (!path) + goto out; + while (strlen (path) > 0 && path[0] == '/') path++; @@ -791,7 +799,9 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie, } snprintf (path, PATH_MAX, "/%s", mntxl->name); +#if SUPPORT_RMTAB mnt3svc_update_mountlist (ms, req, path, NULL); +#endif GF_FREE (path); if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) { fh = nfs3_fh_build_indexed_root_fh (ms->nfsx->children, mntxl); @@ -1163,7 +1173,8 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mres->exp->expname, mres->resolveloc.path); /* Check if this path is authorized to be mounted */ - authcode = mnt3_authenticate_request (ms, mres->req, NULL, NULL, + authcode = mnt3_authenticate_request (ms, mres->req, NULL, + mres->exp->vol->name, mres->exp->fullpath, &authorized_path, &authorized_host, @@ -1185,6 +1196,9 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* Build mountid from the authorized path and stick it in the * filehandle that will get passed back to the client */ + if (!authorized_path) { + goto err; + } __mnt3_build_mountid_from_path (authorized_path, fh.mountid); snprintf (path, PATH_MAX, "/%s%s", mres->exp->vol->name, @@ -1741,7 +1755,7 @@ mnt3_check_client_net_udp (struct svc_req *req, char *volname, xlator_t *nfsx) if ((!req) || (!volname) || (!nfsx)) goto err; - sin = svc_getcaller (req->rq_xprt); + sin = (struct sockaddr_in *)svc_getcaller (req->rq_xprt); if (!sin) goto err; @@ -1896,7 +1910,7 @@ _mnt3_get_host_from_peer (const char *peer_addr) size_t host_len = 0; char *colon = 
NULL; - colon = strchr (peer_addr, ':'); + colon = strrchr (peer_addr, ':'); if (!colon) { gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER, "Bad peer %s", peer_addr); @@ -1925,9 +1939,23 @@ mnt3_check_cached_fh (struct mount3_state *ms, struct nfs3_fh *fh, const char *host_addr, gf_boolean_t is_write_op) { if (!is_write_op) - return is_nfs_fh_cached (ms->authcache, fh, host_addr); + return auth_cache_allows_fh (ms->authcache, fh, host_addr); + + return auth_cache_allows_write_to_fh (ms->authcache, fh, host_addr); +} - return is_nfs_fh_cached_and_writeable (ms->authcache, fh, host_addr); +/** + * mnt3_check_cached_path -- Check if path is cached. + * + * Calls auxiliary functions based on whether we are checking + * a write operation. + * + */ +int +mnt3_check_cached_path (struct mount3_state *ms, const char *path, + const char *host_addr, gf_boolean_t is_write_op) +{ + return auth_cache_allows_path (ms->authcache, path, host_addr); } /** @@ -1961,7 +1989,7 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req, char *pathdup = NULL; size_t dlen = 0; char *auth_host = NULL; - gf_boolean_t fh_cached = _gf_false; + auth_cache_status_t auth_cache_status = AUTH_CACHE_HOST_ENOENT; struct export_item *expitem = NULL; GF_VALIDATE_OR_GOTO (GF_MNT, ms, out); @@ -1982,12 +2010,24 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req, } /* Check if the filehandle is cached */ - fh_cached = mnt3_check_cached_fh (ms, fh, host_addr_ip, is_write_op); - if (fh_cached) { - gf_msg_trace (GF_MNT, 0, "Found cached FH for %s", - host_addr_ip); + auth_cache_status = fh ? 
mnt3_check_cached_fh (ms, fh, host_addr_ip, + is_write_op) : + mnt3_check_cached_path (ms, path, host_addr_ip, + is_write_op); + + if (auth_cache_status == AUTH_CACHE_HOST_AUTH_OK) { + gf_log (GF_MNT, GF_LOG_TRACE, "Found authorized cached " + "FH for [%s]!", host_addr_ip); auth_status_code = 0; goto free_and_out; + } else if (auth_cache_status == AUTH_CACHE_HOST_EACCES) { + gf_log (GF_MNT, GF_LOG_TRACE, "Found de-authorized cached " + "FH for [%s]!", host_addr_ip); + auth_status_code = -EACCES; + goto free_and_out; + } else { + gf_log (GF_MNT, GF_LOG_TRACE, "Cached FH not found for [%s]!", + host_addr_ip); } /* Check if the IP is authorized */ @@ -2018,10 +2058,20 @@ _mnt3_authenticate_req (struct mount3_state *ms, rpcsvc_request_t *req, * host if they are null. */ if (!authorized_export || !authorized_host) { - /* Cache the file handle if it was authorized */ - if (fh && auth_status_code == 0) - cache_nfs_fh (ms->authcache, fh, host_addr_ip, expitem); + if (auth_status_code == 0) { + auth_cache_status = AUTH_CACHE_HOST_AUTH_OK; + } else { + auth_cache_status = AUTH_CACHE_HOST_EACCES; + } + if (fh) { + cache_nfs_fh (ms->authcache, fh, host_addr_ip, + expitem, auth_cache_status); + } + if (path) { + cache_nfs_path (ms->authcache, path, host_addr_ip, + expitem, auth_cache_status); + } goto free_and_out; } @@ -2080,15 +2130,18 @@ mnt3_authenticate_request (struct mount3_state *ms, rpcsvc_request_t *req, const char *path, char **authorized_path, char **authorized_host, gf_boolean_t is_write_op) { - int auth_status_code = -EACCES; - char *parent_path = NULL; - const char *parent_old = NULL; + int auth_status_code = -EACCES; + char *parent_path = NULL; + const char *parent_old = NULL; + struct mnt3_export *exp = NULL; + struct nfs3_state *nfs3 = ms->nfs->nfs3state; GF_VALIDATE_OR_GOTO (GF_MNT, ms, out); GF_VALIDATE_OR_GOTO (GF_MNT, req, out); + GF_VALIDATE_OR_GOTO (GF_MNT, volname, out); /* If this option is not set, just allow it through */ - if 
(!ms->nfs->exports_auth) { + if (!nfs3->exports_auth || !nfs3_is_exports_auth(nfs3, volname)) { /* This function is called in a variety of use-cases (mount * + each fop) so path/authorized_path are not always present. * For the cases which it _is_ present we need to populate the @@ -2213,8 +2266,8 @@ mnt3svc_mnt (rpcsvc_request_t *req) /* The second authentication check is the exports/netgroups * check. */ - authcode = mnt3_authenticate_request (ms, req, NULL, NULL, path, NULL, - NULL, _gf_false); + authcode = mnt3_authenticate_request (ms, req, NULL, exp->vol->name, + path, NULL, NULL, FALSE); if (authcode != 0) { mntstat = MNT3ERR_ACCES; gf_msg_debug (GF_MNT, 0, "Client mount not allowed"); @@ -2265,9 +2318,10 @@ __build_mountlist (struct mount3_state *ms, int *count) if ((!ms) || (!count)) return NULL; +#if SUPPORT_RMTAB /* read rmtab, other peers might have updated it */ mount_read_rmtab(ms); - +#endif *count = 0; gf_msg_debug (GF_MNT, 0, "Building mount list:"); list_for_each_entry (me, &ms->mountlist, mlist) { @@ -2399,7 +2453,9 @@ mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname) nfs = (struct nfs_state *)ms->nfsx->private; +#if SUPPORT_RMTAB update_rmtab = mount_open_rmtab (nfs->rmtab, &sh); +#endif if (update_rmtab) { ret = gf_store_lock (sh); if (ret) @@ -2818,7 +2874,8 @@ __mnt3udp_get_export_subdir_inode (struct svc_req *req, char *subdir, /* AUTH check for subdir i.e. 
nfs.export-dir */ if (exp->hostspec) { - struct sockaddr_in *sin = svc_getcaller (req->rq_xprt); + struct sockaddr_in *sin; + sin = (struct sockaddr_in *)svc_getcaller (req->rq_xprt); ret = mnt3_verify_auth (sin, exp); if (ret) { gf_msg (GF_MNT, GF_LOG_ERROR, EACCES, @@ -3026,7 +3083,9 @@ mount3udp_add_mountlist (xlator_t *nfsx, char *host, char *export) LOCK (&ms->mountlock); { list_add_tail (&me->mlist, &ms->mountlist); +#if SUPPORT_RMTAB mount_rewrite_rmtab(ms, NULL); +#endif } UNLOCK (&ms->mountlock); return 0; @@ -3714,6 +3773,9 @@ __mnt3_mounted_exports_walk (dict_t *dict, char *key, data_t *val, void *tmp) * and umounts them. * * @ms: The mountstate for this service that holds all the information we need + if (!nfs->nfs3state) + return NULL; + * */ void @@ -3800,6 +3862,9 @@ _mnt3_auth_param_refresh_thread (void *argv) /* Sleep before checking the file again */ sleep (mstate->nfs->auth_refresh_time_secs); + if (!mstate->nfs->nfs3state->exports_auth) + continue; + if (_mnt3_has_file_changed (exp_file_path, &exp_time)) { gf_msg (GF_MNT, GF_LOG_INFO, 0, NFS_MSG_UPDATING_EXP, "File %s changed, updating exports,", @@ -3978,7 +4043,7 @@ mnt3svc_init (xlator_t *nfsx) goto err; } - if (nfs->exports_auth) { + if (nfs->nfs3state->exports_auth) { ret = _mnt3_init_auth_params (mstate); if (ret < 0) goto err; @@ -4127,6 +4192,15 @@ mnt1svc_init (xlator_t *nfsx) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (options, "transport.address-family", "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, + "dict_set_str error when trying to enable ipv6"); + goto err; + } +#endif + ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, errno, diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c index e8e226e953e..536a45ede3d 100644 --- a/xlators/nfs/server/src/mount3udp_svc.c +++ b/xlators/nfs/server/src/mount3udp_svc.c @@ -133,7 +133,15 @@ mountudp_program_3(struct svc_req 
*rqstp, register SVCXPRT *transp) mountres3 *res = NULL; struct sockaddr_in *sin = NULL; - sin = svc_getcaller (transp); + sin = (struct sockaddr_in *)svc_getcaller (transp); + /* svc_getcaller returns a pointer to a sockaddr_in6, even though it + * might actually be an IPv4 address. It ought return a struct sockaddr + * and make the caller upcast it to the proper address family. Sigh. + * + * Let's make sure that it's actually an IPv4 address. + */ + GF_ASSERT (sin->sin_family == AF_INET); + inet_ntop (AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN+1); switch (rqstp->rq_proc) { diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c index 526918872d7..fca38ba6b87 100644 --- a/xlators/nfs/server/src/nfs-common.c +++ b/xlators/nfs/server/src/nfs-common.c @@ -146,8 +146,12 @@ nfs_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path) gf_uuid_copy (loc->gfid, inode->gfid); } - if (parent) + if (parent) { loc->parent = inode_ref (parent); + if (!gf_uuid_is_null (parent->gfid)) { + gf_uuid_copy (loc->pargfid, parent->gfid); + } + } if (path) { loc->path = gf_strdup (path); diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c index ddfa89dab11..e94cb03b771 100644 --- a/xlators/nfs/server/src/nfs.c +++ b/xlators/nfs/server/src/nfs.c @@ -33,6 +33,7 @@ #include "syscall.h" #include "rpcsvc.h" #include "nfs-messages.h" +#include "syncop.h" #define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids" #define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout" @@ -204,6 +205,10 @@ nfs_program_register_portmap_all (struct nfs_state *nfs) if (nfs->override_portnum) prog->progport = nfs->override_portnum; (void) rpcsvc_program_register_portmap (prog, prog->progport); +#ifdef IPV6_DEFAULT + (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport, + TRUE); +#endif } return (0); @@ -285,6 +290,55 @@ nfs_deinit_versions (struct list_head *versions, xlator_t *this) return 0; } +void rpcbind_register_prog 
(rpcsvc_program_t *prog) +{ + if (!prog) { + return; + } + + /* + * Attempt to register the program with rpcbind. In 99.9% of cases, + * This call will most likely *always* fail, since the program should already + * be registered. We don't care if this call fails since it is best effort. + */ + rpcsvc_program_register_portmap (prog, prog->progport); +#ifdef IPV6_DEFAULT + rpcsvc_program_register_rpcbind6 (prog, prog->progport, FALSE); +#endif +} + +/** + * rpcbind_autoregister_task + * + * The purpose of this task is to attempt to ensure that NFS stays + * registered with rpcbind. The thread is "best effort", and as a + * result we do not care what the result of the call is. + */ +int rpcbind_autoregister_task (void *arg) +{ + struct nfs_state *nfs = arg; + struct nfs_initer_list *version = NULL; + struct nfs_initer_list *tmp = NULL; + rpcsvc_program_t *prog = NULL; + struct list_head *versions = &nfs->versions; + + list_for_each_entry_safe (version, tmp, versions, list) { + rpcbind_register_prog (version->program); + } + + return 0; +} + +void *nfs_janitor (void *arg) +{ + struct nfs_state *nfs = arg; + while (_gf_true) { + synctask_new (nfs->this->ctx->env, rpcbind_autoregister_task, + NULL, NULL, nfs); + sleep (10); + } +} + int nfs_init_versions (struct nfs_state *nfs, xlator_t *this) { @@ -339,6 +393,18 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this) if (version->required) goto err; } +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_register_rpcbind6 (prog, + prog->progport, + TRUE); + if (ret == -1) { + gf_msg (GF_NFS, GF_LOG_ERROR, 0, + NFS_MSG_PGM_REG_FAIL, + "Program (ipv6) %s registration failed", + prog->progname); + goto err; + } +#endif } } @@ -348,6 +414,18 @@ err: return ret; } +int +nfs_janitor_init (struct nfs_state *nfs) +{ + int ret = pthread_create (&nfs->janitor_thread, NULL, nfs_janitor, nfs); + if (ret != 0) { + gf_log (GF_NFS, GF_LOG_WARNING, + "Unable to start rpcbind register thread! 
Error=%s", + strerror (ret)); + return -1; + } + return 0; +} int nfs_add_all_initiators (struct nfs_state *nfs) @@ -355,24 +433,24 @@ nfs_add_all_initiators (struct nfs_state *nfs) int ret = 0; /* Add the initializers for all versions. */ - ret = nfs_add_initer (&nfs->versions, mnt3svc_init, _gf_true); + ret = nfs_add_initer (&nfs->versions, mnt1svc_init, _gf_true); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL, - "Failed to add MOUNT3 protocol initializer"); + "Failed to add MOUNT1 protocol initializer"); goto ret; } - ret = nfs_add_initer (&nfs->versions, mnt1svc_init, _gf_true); + ret = nfs_add_initer (&nfs->versions, nfs3svc_init, _gf_true); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL, - "Failed to add MOUNT1 protocol initializer"); + "Failed to add NFS3 protocol initializer"); goto ret; } - ret = nfs_add_initer (&nfs->versions, nfs3svc_init, _gf_true); + ret = nfs_add_initer (&nfs->versions, mnt3svc_init, _gf_true); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL, - "Failed to add NFS3 protocol initializer"); + "Failed to add MOUNT3 protocol initializer"); goto ret; } @@ -759,6 +837,8 @@ nfs_init_state (xlator_t *this) return NULL; } + nfs->this = this; + nfs->memfactor = GF_NFS_DEFAULT_MEMFACTOR; if (dict_get (this->options, "nfs.mem-factor")) { ret = dict_get_str (this->options, "nfs.mem-factor", @@ -901,6 +981,16 @@ nfs_init_state (xlator_t *this) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (this->options, "transport.address-family", + "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error"); + goto free_foppool; + } +#endif + + /* Right only socket support exists between nfs client and * gluster nfs, so we can set default value as socket */ @@ -933,24 +1023,22 @@ nfs_init_state (xlator_t *this) } nfs->exports_auth = GF_NFS_DEFAULT_EXPORT_AUTH; - if (dict_get(this->options, "nfs.exports-auth-enable")) { + if (dict_get (this->options, 
"nfs.exports-auth-enable")) { ret = dict_get_str (this->options, "nfs.exports-auth-enable", &optstr); if (ret == -1) { - gf_msg (GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL, - "Failed to parse dict"); + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict"); goto free_foppool; } ret = gf_string2boolean (optstr, &boolt); if (ret < 0) { - gf_msg (GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL, - "Failed to parse bool string"); + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse bool " + "string"); goto free_foppool; } - if (boolt == _gf_true) - nfs->exports_auth = 1; + nfs->exports_auth = boolt; } nfs->auth_refresh_time_secs = GF_NFS_DEFAULT_AUTH_REFRESH_INTERVAL_SEC; @@ -1190,6 +1278,7 @@ nfs_reconfigure_state (xlator_t *this, dict_t *options) "nfs.transport-type", "nfs.mem-factor", NULL}; + char *exports_auth_enable = NULL; GF_VALIDATE_OR_GOTO (GF_NFS, this, out); GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out); @@ -1269,6 +1358,21 @@ nfs_reconfigure_state (xlator_t *this, dict_t *options) "Reconfigured nfs.mount-rmtab path: %s", nfs->rmtab); } + /* reconfig nfs.exports-auth-enable */ + if (dict_get (options, "nfs.exports-auth-enable")) { + ret = dict_get_str (options, "nfs.exports-auth-enable", + &exports_auth_enable); + if (ret < 0) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read " + "reconfigured option: nfs.exports-auth-enable"); + goto out; + } + ret = gf_string2int (exports_auth_enable, &nfs->exports_auth); + if (ret < 0) { + goto out; + } + } + GF_OPTION_RECONF (OPT_SERVER_AUX_GIDS, optbool, options, bool, out); if (nfs->server_aux_gids != optbool) { @@ -1520,6 +1624,13 @@ init (xlator_t *this) { return (-1); } + ret = nfs_janitor_init (nfs); + if (ret) { + gf_msg (GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_FAIL, + "Failed to initialize janitor"); + return (-1); + } + gf_msg (GF_NFS, GF_LOG_INFO, 0, NFS_MSG_STARTED, "NFS service started"); return (0); /* SUCCESS */ @@ -2019,7 +2130,7 @@ struct volume_options options[] = { }, { .key = {"nfs.mount-rmtab"}, .type 
= GF_OPTION_TYPE_PATH, - .default_value = NFS_DATADIR "/rmtab", + .default_value = "/-", .description = "Set the location of the cache file that is used to " "list all the NFS-clients that have connected " "through the MOUNT protocol. If this is on shared " @@ -2075,7 +2186,7 @@ struct volume_options options[] = { .description = "Sets the number of non-idempotent " "requests to cache in drc" }, - { .key = {"nfs.exports-auth-enable"}, + { .key = {"nfs.*.exports-auth-enable"}, .type = GF_OPTION_TYPE_BOOL, .description = "Set the option to 'on' to enable exports/netgroup " "authentication in the NFS server and mount daemon." diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h index 9bcc88f5548..4f5faf29f6b 100644 --- a/xlators/nfs/server/src/nfs.h +++ b/xlators/nfs/server/src/nfs.h @@ -96,6 +96,8 @@ struct nfs_state { uint32_t server_aux_gids_max_age; gid_cache_t gid_cache; uint32_t generation; + pthread_t janitor_thread; + xlator_t *this; gf_boolean_t register_portmap; char *rpc_statd; char *rpc_statd_pid_file; diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c index 0b977092fbb..64bd08a3fc7 100644 --- a/xlators/nfs/server/src/nfs3-helpers.c +++ b/xlators/nfs/server/src/nfs3-helpers.c @@ -239,7 +239,12 @@ nfs3_errno_to_nfsstat3 (int errnum) break; case ENOTCONN: - stat = NFS3ERR_IO; + /* If connections to bricks cannot be established, + * the filesystem is effectively in read-only mode + * to protect data. E.g., when all bricks in a subvolume + * crash. 
+ */ + stat = NFS3ERR_ROFS; break; case EDQUOT: @@ -3975,11 +3980,18 @@ nfs3_fh_auth_nfsop (nfs3_call_state_t *cs, gf_boolean_t is_write_op) { struct nfs_state *nfs = NULL; struct mount3_state *ms = NULL; + int auth_status = -1; nfs = (struct nfs_state *)cs->nfsx->private; ms = (struct mount3_state *)nfs->mstate; - return mnt3_authenticate_request (ms, cs->req, &cs->resolvefh, NULL, - NULL, NULL, NULL, is_write_op); + auth_status = mnt3_authenticate_request (ms, cs->req, &cs->resolvefh, + cs->vol->name, NULL, NULL, + NULL, is_write_op); + + if (auth_status != 0) { + cs->resolve_errno = auth_status; + } + return auth_status; } int diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c index 8b1d62b46ac..2426028ed2d 100644 --- a/xlators/nfs/server/src/nfs3.c +++ b/xlators/nfs/server/src/nfs3.c @@ -211,6 +211,25 @@ out: return ret; } +int +nfs3_is_exports_auth (struct nfs3_state *nfs3, const char *volname) +{ + int ret = 0; + struct nfs3_export *exp = NULL; + + GF_VALIDATE_OR_GOTO (GF_NFS3, nfs3, out); + + list_for_each_entry (exp, &nfs3->exports, explist) { + if (strcmp (exp->subvol->name, volname) == 0) { + ret = exp->exports_auth; + break; + } + } + +out: + return ret; +} + #define nfs3_map_fh_to_volume(nfs3state, handle, req, volume, status, label) \ do { \ @@ -413,6 +432,28 @@ out: } +/* + * This macro checks if the volume is started or not. + * If it is not started, it closes the client connection & logs it. + * + * Why do we do this? + * + * There is a "race condition" where gNFSd may start listening for RPC requests + * prior to the volume being started. Presumably, that is why this macro exists + * in the first place. In the NFS kernel client (specifically Linux's NFS + * kernel client), they establish a TCP connection to our endpoint and + * (re-)send requests. If we ignore the request, and return nothing back, + * the NFS kernel client waits forever for our response. 
If for some reason, + * the TCP connection were to die, and re-establish, the requests are + * retransmitted and everything begins working as expected + * + * Now, this is clearly bad behavior on the client side, + * but in order to make every user's life easier, + * gNFSd should simply disconnect the TCP connection if it sees requests + * before it is ready to accept them. + * + */ + #define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl) \ do { \ if ((!nfs_subvolume_started (nfs_state (nf3stt->nfsx), vlm))){\ @@ -420,11 +461,32 @@ out: NFS_MSG_VOL_DISABLE, \ "Volume is disabled: %s", \ vlm->name); \ + nfs3_disconnect_transport (req->trans); \ rtval = RPCSVC_ACTOR_IGNORE; \ goto erlbl; \ } \ } while (0) \ +void +nfs3_disconnect_transport (rpc_transport_t *transport) +{ + int ret = 0; + + GF_VALIDATE_OR_GOTO (GF_NFS3, transport, out); + + ret = rpc_transport_disconnect (transport); + if (ret != 0) { + gf_log (GF_NFS3, GF_LOG_WARNING, + "Unable to close client connection to %s.", + transport->peerinfo.identifier); + } else { + gf_log (GF_NFS3, GF_LOG_WARNING, + "Closed client connection to %s.", + transport->peerinfo.identifier); + } +out: + return; +} int nfs3_export_sync_trusted (struct nfs3_state *nfs3, uuid_t exportid) @@ -819,6 +881,12 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, cs = frame->local; if (op_ret == -1) { + /* Prevent crashes for the case where this call fails + * and buf is left in a NULL state, yet the op_errno == 0. 
+ */ + if (!buf && op_errno == 0) { + op_errno = EIO; + } status = nfs3_cbk_errno_status (op_ret, op_errno); } @@ -5621,6 +5689,35 @@ no_dvm: (exp->trusted_sync == 0)?"no trusted_sync":"trusted_sync", (exp->trusted_write == 0)?"no trusted_write":"trusted_write"); ret = 0; + + ret = snprintf (searchkey, 1024, "nfs.%s.exports-auth-enable", name); + if (ret < 0) { + gf_log (GF_NFS, GF_LOG_ERROR, "snprintf failed"); + ret = -1; + goto err; + } + + if (dict_get (options, searchkey)) { + ret = dict_get_str (options, searchkey, &optstr); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict"); + goto err; + } + + ret = gf_string2boolean (optstr, &boolt); + if (ret < 0) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse bool " + "string"); + goto err; + } + + exp->exports_auth = boolt ? TRUE : FALSE; + if (boolt) { + struct nfs_state *priv = nfsx->private; + priv->nfs3state->exports_auth = boolt; + } + } + err: return ret; } @@ -5727,6 +5824,7 @@ nfs3_init_state (xlator_t *nfsx) goto ret; } + nfs->nfs3state = nfs3; nfs3->nfsx = nfsx; nfs3->exportslist = nfsx->children; INIT_LIST_HEAD (&nfs3->exports); @@ -5749,7 +5847,6 @@ nfs3_init_state (xlator_t *nfsx) goto free_localpool; } - nfs->nfs3state = nfs3; ret = 0; free_localpool: diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h index 4cb3e67528d..36d981c3eef 100644 --- a/xlators/nfs/server/src/nfs3.h +++ b/xlators/nfs/server/src/nfs3.h @@ -31,6 +31,7 @@ #define GF_NFS3_IOBPOOL_MULT GF_NFS_CONCURRENT_OPS_MULT #define GF_NFS3_CLTABLE_BUCKETS_MULT 2 #define GF_NFS3_FDTABLE_BUCKETS_MULT 2 +#define GF_NFS3_DEFAULT_EXPORT_AUTH _gf_false /* Static values used for FSINFO @@ -45,7 +46,7 @@ #define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */ #define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */ -#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX +#define GF_NFS3_FILE_IO_SIZE_DEF (512 * GF_UNIT_KB) #define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX #define GF_NFS3_RTMIN 
GF_NFS3_FILE_IO_SIZE_MIN @@ -99,6 +100,7 @@ struct nfs3_export { int trusted_sync; int trusted_write; int rootlookedup; + int exports_auth; }; #define GF_NFS3_DEFAULT_VOLACCESS (GF_NFS3_VOLACCESS_RW) @@ -142,6 +144,9 @@ typedef struct nfs3_state { gf_lock_t fdlrulock; int fdcount; uint32_t occ_logger; + + /* Enable exports auth model */ + gf_boolean_t exports_auth; } nfs3_state_t; typedef enum nfs3_lookup_type { @@ -280,4 +285,7 @@ nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options); extern uint64_t nfs3_request_xlator_deviceid (rpcsvc_request_t *req); +extern int +nfs3_is_exports_auth (struct nfs3_state *nfs3, const char *volname); + #endif diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c index 98c37746921..f199b229bc2 100644 --- a/xlators/performance/io-cache/src/io-cache.c +++ b/xlators/performance/io-cache/src/io-cache.c @@ -1479,6 +1479,74 @@ ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, return 0; } +int32_t +ioc_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct statvfs *buf, dict_t *xdata) +{ + ioc_table_t *table = NULL; + struct ioc_statvfs *cache = NULL; + + if (op_ret != 0) + goto out; + + table = this->private; + cache = &table->statfs_cache; + + LOCK (&cache->lock); + + gettimeofday (&cache->tv, NULL); + cache->buf = *buf; + + UNLOCK (&cache->lock); + +out: + STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +ioc_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ioc_table_t *table = NULL; + struct ioc_statvfs *cache = NULL; + struct statvfs buf; + struct timeval tv = {0,}; + + table = this->private; + cache = &table->statfs_cache; + + if (!cache->enabled) + goto disabled; + + gettimeofday (&tv, NULL); + + LOCK (&cache->lock); + + if (time_elapsed (&tv, &cache->tv) >= cache->timeout) { + UNLOCK (&cache->lock); + goto uncached; + } + + buf = 
cache->buf; + + UNLOCK (&cache->lock); + + STACK_UNWIND_STRICT (statfs, frame, 0, 0, &buf, xdata); + + return 0; + +disabled: + STACK_WIND_TAIL (frame, FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->statfs, loc, xdata); + return 0; + +uncached: + STACK_WIND (frame, ioc_statfs_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->statfs, loc, xdata); + return 0; +} int32_t ioc_get_priority_list (const char *opt_str, struct list_head *first) @@ -1696,6 +1764,13 @@ reconfigure (xlator_t *this, dict_t *options) } table->cache_size = cache_size_new; + GF_OPTION_RECONF ("statfs-cache", table->statfs_cache.enabled, + options, bool, unlock); + + GF_OPTION_RECONF ("statfs-cache-timeout", + table->statfs_cache.timeout, + options, int32, unlock); + ret = 0; } unlock: @@ -1755,6 +1830,10 @@ init (xlator_t *this) GF_OPTION_INIT ("max-file-size", table->max_file_size, size_uint64, out); + GF_OPTION_INIT ("statfs-cache", table->statfs_cache.enabled, bool, out); + + GF_OPTION_INIT ("statfs-cache-timeout", table->statfs_cache.timeout, int32, out); + if (!check_cache_size_ok (this, table->cache_size)) { ret = -1; goto out; @@ -1827,6 +1906,11 @@ init (xlator_t *this) ctx = this->ctx; ioc_log2_page_size = log_base2 (ctx->page_size); + LOCK_INIT (&table->statfs_cache.lock); + /* Invalidate statfs cache */ + table->statfs_cache.tv.tv_sec = 0; + table->statfs_cache.tv.tv_usec = 0; + out: if (ret == -1) { if (table != NULL) { @@ -2096,6 +2180,7 @@ fini (xlator_t *this) GF_ASSERT (list_empty (&table->inode_lru[i])); } + LOCK_DESTROY (&table->statfs_cache.lock); GF_ASSERT (list_empty (&table->inodes)); */ pthread_mutex_destroy (&table->table_lock); @@ -2120,6 +2205,7 @@ struct xlator_fops fops = { .readdirp = ioc_readdirp, .discard = ioc_discard, .zerofill = ioc_zerofill, + .statfs = ioc_statfs, }; @@ -2171,5 +2257,21 @@ struct volume_options options[] = { .description = "Maximum file size which would be cached by the " "io-cache translator." 
}, + { .key = {"statfs-cache"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "0", + .description = "The cached statfs for a filesystem will be " + "till 'statfs-cache-timeout' seconds, after which re-validation " + "is performed." + }, + { .key = {"statfs-cache-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60, + .default_value = "1", + .description = "The cached statfs for a filesystem will be " + "till 'statfs-cache-timeout' seconds, after which re-validation " + "is performed." + }, { .key = {NULL} }, }; diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h index d7c823fe962..da71b2f2371 100644 --- a/xlators/performance/io-cache/src/io-cache.h +++ b/xlators/performance/io-cache/src/io-cache.h @@ -148,23 +148,32 @@ struct ioc_inode { inode_t *inode; }; +struct ioc_statvfs { + struct statvfs buf; + int32_t timeout; + struct timeval tv; + gf_boolean_t enabled; + gf_lock_t lock; +}; + struct ioc_table { - uint64_t page_size; - uint64_t cache_size; - uint64_t cache_used; - uint64_t min_file_size; - uint64_t max_file_size; - struct list_head inodes; /* list of inodes cached */ - struct list_head active; - struct list_head *inode_lru; - struct list_head priority_list; - int32_t readv_count; - pthread_mutex_t table_lock; - xlator_t *xl; - uint32_t inode_count; - int32_t cache_timeout; - int32_t max_pri; - struct mem_pool *mem_pool; + uint64_t page_size; + uint64_t cache_size; + uint64_t cache_used; + uint64_t min_file_size; + uint64_t max_file_size; + struct list_head inodes; /* list of inodes cached */ + struct list_head active; + struct list_head *inode_lru; + struct list_head priority_list; + int32_t readv_count; + pthread_mutex_t table_lock; + xlator_t *xl; + uint32_t inode_count; + int32_t cache_timeout; + int32_t max_pri; + struct mem_pool *mem_pool; + struct ioc_statvfs statfs_cache; }; typedef struct ioc_table ioc_table_t; diff --git a/xlators/performance/io-threads/src/io-threads.c 
b/xlators/performance/io-threads/src/io-threads.c index 72a82082563..7f9dc5f82a8 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -161,8 +161,6 @@ iot_worker (void *data) THIS = this; for (;;) { - sleep_till.tv_sec = time (NULL) + conf->idle_time; - pthread_mutex_lock (&conf->mutex); { if (pri != -1) { @@ -175,8 +173,11 @@ iot_worker (void *data) break; } - conf->sleep_count++; + clock_gettime (CLOCK_REALTIME_COARSE, + &sleep_till); + sleep_till.tv_sec += conf->idle_time; + conf->sleep_count++; ret = pthread_cond_timedwait (&conf->cond, &conf->mutex, &sleep_till); @@ -232,14 +233,25 @@ int do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri) { int ret = 0; + int active_count = 0; pthread_mutex_lock (&conf->mutex); { __iot_enqueue (conf, stub, pri); - pthread_cond_signal (&conf->cond); - - ret = __iot_workers_scale (conf); + /* If we have an ample supply of threads alive already + * it's massively more efficient to keep the ones you have + * busy vs making new ones and signaling everyone + */ + active_count = conf->curr_count - conf->sleep_count; + if (conf->fops_per_thread_ratio == 0 || active_count == 0 || + (conf->queue_size/active_count > + conf->fops_per_thread_ratio && + active_count < conf->max_count)) { + pthread_cond_signal (&conf->cond); + + ret = __iot_workers_scale (conf); + } } pthread_mutex_unlock (&conf->mutex); @@ -266,6 +278,9 @@ iot_get_pri_meaning (iot_pri_t pri) case IOT_PRI_MAX: name = "invalid"; break; + case IOT_PRI_UNSPEC: + name = "unspecified"; + break; } return name; } @@ -598,6 +613,34 @@ int iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { + iot_conf_t *conf = NULL; + dict_t *depths = NULL; + int i = 0; + + conf = this->private; + + if (conf && name && strcmp (name, IO_THREADS_QUEUE_SIZE_KEY) == 0) { + // We explicitly do not want a reference count + // for this dict in this translator + depths = get_new_dict 
(); + if (!depths) + goto unwind_special_getxattr; + + for (i = 0; i < IOT_PRI_MAX; i++) { + if (dict_set_int32 (depths, + (char *)fop_pri_to_string (i), + conf->queue_sizes[i]) != 0) { + dict_destroy (depths); + depths = NULL; + goto unwind_special_getxattr; + } + } + +unwind_special_getxattr: + STACK_UNWIND_STRICT (getxattr, frame, 0, 0, depths, xdata); + return 0; + } + IOT_FOP (getxattr, frame, this, loc, name, xdata); return 0; } @@ -904,6 +947,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); + GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio, + options, int32, out); + GF_OPTION_RECONF ("high-prio-threads", conf->ac_iot_limit[IOT_PRI_HI], options, int32, out); @@ -978,6 +1024,9 @@ init (xlator_t *this) GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); + GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio, + int32, out); + GF_OPTION_INIT ("high-prio-threads", conf->ac_iot_limit[IOT_PRI_HI], int32, out); @@ -1140,6 +1189,20 @@ struct volume_options options[] = { "perform concurrent IO operations" }, + { .key = {"fops-per-thread-ratio"}, + .type = GF_OPTION_TYPE_INT, + .min = IOT_MIN_FOP_PER_THREAD, + .max = IOT_MAX_FOP_PER_THREAD, + .default_value = "20", + .description = "The optimal ratio of threads to FOPs in the queue " + "we wish to achieve before creating a new thread. " + "The idea here is it's far cheaper to keep our " + "currently running threads busy than spin up " + "new threads or cause a stampeding herd of threads " + "to service a singlular FOP when you have a thread " + "which will momentarily become available to do the " + "work." 
+ }, { .key = {"high-prio-threads"}, .type = GF_OPTION_TYPE_INT, .min = IOT_MIN_THREADS, diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index fa955b5954b..011d4a00f7f 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -34,20 +34,14 @@ struct iot_conf; #define IOT_MIN_THREADS 1 #define IOT_DEFAULT_THREADS 16 -#define IOT_MAX_THREADS 64 +#define IOT_MAX_THREADS 256 +#define IOT_MIN_FOP_PER_THREAD 0 +#define IOT_MAX_FOP_PER_THREAD 2000 #define IOT_THREAD_STACK_SIZE ((size_t)(1024*1024)) -typedef enum { - IOT_PRI_HI = 0, /* low latency */ - IOT_PRI_NORMAL, /* normal */ - IOT_PRI_LO, /* bulk */ - IOT_PRI_LEAST, /* least */ - IOT_PRI_MAX, -} iot_pri_t; - #define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */ struct iot_least_throttle { struct timeval sample_time; /* timestamp of current sample */ @@ -62,6 +56,7 @@ struct iot_conf { pthread_cond_t cond; int32_t max_count; /* configured maximum */ + int32_t fops_per_thread_ratio; int32_t curr_count; /* actual number of threads running */ int32_t sleep_count; diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 30443761c56..c3baafdc1b6 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -33,6 +33,7 @@ struct mdc_conf { gf_boolean_t cache_selinux; gf_boolean_t force_readdirp; gf_boolean_t cache_swift_metadata; + gf_boolean_t cache_all_xattrs; }; @@ -792,6 +793,7 @@ struct checkpair { static int is_mdc_key_satisfied (const char *key) { + unsigned int checked_keys = 0; const char *mdc_key = NULL; int i = 0; @@ -801,11 +803,13 @@ is_mdc_key_satisfied (const char *key) for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) { if (!mdc_keys[i].load) continue; + + checked_keys++; if (strcmp (mdc_key, key) == 0) return 1; } - return 0; + return 0; } @@ -875,7 +879,7 
@@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_rsp = NULL; dict_t *xattr_alloc = NULL; mdc_local_t *local = NULL; - + struct mdc_conf *conf = this->private; local = mdc_local_get (frame); if (!local) @@ -899,10 +903,17 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (ret != 0) goto uncached; - if (!mdc_xattr_satisfied (this, xdata, xattr_rsp)) + /* Only check the keys if we are not caching all the xattrs */ + if (!conf->cache_all_xattrs && + !mdc_xattr_satisfied (this, xdata, xattr_rsp)) { goto uncached; + } } + gf_msg (this->name, GF_LOG_TRACE, 0, 0, + "Returning lookup from cache for gfid %s", + uuid_utoa(loc->inode->gfid)); + MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf, xattr_rsp, &postparent); @@ -1882,6 +1893,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, int op_errno = ENODATA; mdc_local_t *local = NULL; dict_t *xattr = NULL; + struct mdc_conf *conf = this->private; local = mdc_local_get (frame); if (!local) @@ -1897,7 +1909,18 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, goto uncached; if (!xattr || !dict_get (xattr, (char *)key)) { - ret = -1; + /* If we can't find the extended attribute, & cache-all-xattrs + * is enabled, we should wind and try to find them. + * + * NOTE: Quota & AFR queries through the mount + * (i.e, virtual Gluster xattrs) + * won't work unless we do this. 
+ */ + if (conf->cache_all_xattrs) { + goto uncached; + } + + ret = -1; op_errno = ENODATA; } @@ -2363,7 +2386,8 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out); - + GF_OPTION_RECONF("cache-all-xattrs", conf->cache_all_xattrs, options, + bool, out); out: return 0; } @@ -2404,6 +2428,7 @@ init (xlator_t *this) conf->cache_swift_metadata); GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); + GF_OPTION_INIT ("cache-all-xattrs", conf->cache_all_xattrs, bool, out); out: this->private = conf; @@ -2474,7 +2499,7 @@ struct volume_options options[] = { { .key = {"md-cache-timeout"}, .type = GF_OPTION_TYPE_INT, .min = 0, - .max = 60, + .max = 300, .default_value = "1", .description = "Time period after which cache has to be refreshed", }, @@ -2484,5 +2509,19 @@ struct volume_options options[] = { .description = "Convert all readdir requests to readdirplus to " "collect stat info on each entry.", }, + { .key = {"strict-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "When reading extended attributes from the cache, " + "if an xattr is not found, attempt to find it by winding " + "instead of returning ENODATA. This is necessary to query " + "the special extended attributes (trusted.glusterfs.quota.size) " + "through a FUSE mount with md-cache enabled." 
+ }, + { .key = {"cache-all-xattrs"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Cache all the extended attributes for an inode.", + }, { .key = {NULL} }, }; diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 7f5719b1e48..bc59036ff88 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -169,6 +169,7 @@ typedef struct wb_request { typedef struct wb_conf { uint64_t aggregate_size; + uint64_t page_size; uint64_t window_size; gf_boolean_t flush_behind; gf_boolean_t trickling_writes; @@ -1207,18 +1208,21 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req) char *ptr = NULL; struct iobuf *iobuf = NULL; struct iobref *iobref = NULL; + struct wb_conf *conf = NULL; int ret = -1; ssize_t required_size = 0; size_t holder_len = 0; size_t req_len = 0; + conf = req->wb_inode->this->private; + if (!holder->iobref) { holder_len = iov_length (holder->stub->args.vector, holder->stub->args.count); req_len = iov_length (req->stub->args.vector, req->stub->args.count); - required_size = max ((THIS->ctx->page_size), + required_size = max ((conf->page_size), (holder_len + req_len)); iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool, required_size); @@ -1281,7 +1285,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) wb_request_t *holder = NULL; wb_conf_t *conf = NULL; int ret = 0; - ssize_t page_size = 0; /* With asynchronous IO from a VM guest (as a file), there can be two sequential writes happening in two regions @@ -1292,7 +1295,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) through the interleaved ops */ - page_size = wb_inode->this->ctx->page_size; conf = wb_inode->this->private; list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { @@ -1343,7 +1345,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode) continue; } - space_left = page_size - holder->write_size; + space_left = 
wb_inode->window_conf - holder->write_size; if (space_left < req->write_size) { holder->ordering.go = 1; @@ -2471,6 +2473,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out); + GF_OPTION_RECONF ("cache-size", conf->page_size, options, size_uint64, + out); + GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool, out); @@ -2522,6 +2527,7 @@ init (xlator_t *this) /* configure 'option window-size <size>' */ GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out); + GF_OPTION_INIT ("cache-size", conf->page_size, size_uint64, out); if (!conf->window_size && conf->aggregate_size) { gf_msg (this->name, GF_LOG_WARNING, 0, diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index 988c1dce758..d0c63c18b46 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -15,6 +15,7 @@ #include "glusterfs.h" #include "statedump.h" #include "compat-errno.h" +#include "latency.h" #include "glusterfs3.h" #include "portmap-xdr.h" @@ -1542,7 +1543,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi rpc_clnt_reconfig (conf->rpc, &config); conf->skip_notify = 1; - conf->quick_reconnect = 1; + conf->quick_reconnect = 1; out: if (frame) diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 66f15b8a67c..aa9cf9b31e4 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -467,7 +467,7 @@ int32_t client_forget (xlator_t *this, inode_t *inode) { /* Nothing here */ - return 0; + return 0; } int32_t @@ -545,7 +545,7 @@ out: STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -571,7 +571,7 @@ out: if (ret) STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -600,7 +600,7 @@ out: 
STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -628,7 +628,7 @@ out: if (ret) STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -657,7 +657,7 @@ out: if (ret) STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -687,7 +687,7 @@ out: if (ret) STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -718,7 +718,7 @@ out: STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -748,7 +748,7 @@ out: STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -778,7 +778,7 @@ out: STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -807,7 +807,7 @@ out: STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -837,7 +837,7 @@ out: STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -867,7 +867,7 @@ out: STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -897,7 +897,7 @@ out: STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -932,7 +932,7 @@ out: STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -965,7 +965,7 @@ out: if (ret) STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1000,7 +1000,7 @@ out: STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; } @@ -1038,7 +1038,7 @@ out: if (ret) STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1064,7 +1064,7 @@ out: if (ret) STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL); 
- return 0; + return 0; } @@ -1093,7 +1093,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1120,7 +1120,7 @@ out: if (ret) STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1149,7 +1149,7 @@ out: if (ret) STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1177,7 +1177,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1204,7 +1204,7 @@ out: if (ret) STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } static gf_boolean_t @@ -1393,7 +1393,7 @@ out: if (need_unwind) STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); - return 0; + return 0; } @@ -1423,7 +1423,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1453,7 +1453,7 @@ out: if (ret) STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1482,7 +1482,7 @@ out: if (ret) STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1512,7 +1512,7 @@ out: if (ret) STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1542,7 +1542,7 @@ out: if (ret) STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1571,7 +1571,7 @@ out: if (ret) STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1598,7 +1598,7 @@ out: if (ret) STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1654,7 +1654,7 @@ out: if (ret) STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1684,7 +1684,7 @@ out: if (ret) STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1715,7 +1715,7 @@ out: if (ret) STACK_UNWIND_STRICT 
(finodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1747,7 +1747,7 @@ out: if (ret) STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1780,7 +1780,7 @@ out: if (ret) STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1809,7 +1809,7 @@ out: if (ret) STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL); - return 0; + return 0; } int32_t @@ -1840,7 +1840,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1872,7 +1872,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1901,7 +1901,7 @@ out: if (ret) STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -1929,7 +1929,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -2155,7 +2155,7 @@ out: if (ret) STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL); - return 0; + return 0; } @@ -2227,6 +2227,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf = this->private; switch (event) { + case RPC_CLNT_PING: + { + ret = default_notify (this, GF_EVENT_CHILD_PING, NULL); + if (ret) + gf_log (this->name, GF_LOG_INFO, + "CHILD_PING notify failed"); + conf->last_sent_event = GF_EVENT_CHILD_PING; + break; + } case RPC_CLNT_CONNECT: { conf->connected = 1; @@ -2312,13 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf->connected = 0; conf->skip_notify = 0; - if (conf->quick_reconnect) { - conf->quick_reconnect = 0; - rpc_clnt_cleanup_and_start (rpc); - - } else { + if (conf->rpc->conn.connected) { + /* Having conf->connected false and + * conf->rpc->conn.connected true is an + * unrecoverable state, since rpc_clnt_reconnect + * will do nothing for an already connected connection. 
+ * A good fix would be to ensure serialized + * delivery of transport messages, but that is super hard + * and this is rare. So... ghetto "fix", disconnect the + * RPC and start the race again. Maybe we'll win + * next time! + */ + gf_log (this->name, GF_LOG_WARNING, + "Client %s reconnect race detected, " + "restarting.", conf->rpc->conn.name); + conf->quick_reconnect = 1; + rpc_transport_disconnect (rpc->conn.trans); rpc->conn.config.remote_port = 0; - + } else { + if (conf->quick_reconnect) { + conf->quick_reconnect = 0; + rpc_clnt_cleanup_and_start (rpc); + } else { + rpc->conn.config.remote_port = 0; + } } break; @@ -2670,7 +2696,7 @@ reconfigure (xlator_t *this, dict_t *options) ret = 0; out: - return ret; + return ret; } @@ -2724,6 +2750,8 @@ init (xlator_t *this) this->private = conf; + this->client_latency.min = UINT64_MAX; + /* If it returns -1, then its a failure, if it returns +1 we need have to understand that 'this' is subvolume of a xlator which, will set the remote host and remote subvolume in a setxattr @@ -3001,7 +3029,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_TIME, .min = 0, .max = 1013, - .default_value = "42", + .default_value = "180", .description = "Time duration for which the client waits to " "check if the server is responsive." 
}, diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c index 1ad45394dd7..a1fe2e85267 100644 --- a/xlators/protocol/server/src/server-resolve.c +++ b/xlators/protocol/server/src/server-resolve.c @@ -11,6 +11,7 @@ #include "server.h" #include "server-helpers.h" #include "server-messages.h" +#include "compat-errno.h" int @@ -58,6 +59,10 @@ resolve_gfid_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, resolve = state->resolve_now; resolve_loc = &resolve->resolve_loc; + if (!state->loc.inode && inode) { + state->loc.inode = inode_ref (inode); + } + if (op_ret == -1) { if (op_errno == ENOENT) { gf_msg_debug (this->name, 0, "%s/%s: failed to resolve" @@ -71,7 +76,9 @@ resolve_gfid_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uuid_utoa (resolve_loc->pargfid), resolve_loc->name, strerror (op_errno)); } - goto out; + if (op_errno != ENODATA) { + goto out; + } } link_inode = inode_link (inode, resolve_loc->parent, diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c index d5410573ac3..ee8ce825098 100644 --- a/xlators/protocol/server/src/server-rpc-fops.c +++ b/xlators/protocol/server/src/server-rpc-fops.c @@ -33,6 +33,10 @@ void forget_inode_if_no_dentry (inode_t *inode) { + if (!inode) { + return; + } + if (!inode_has_dentry (inode)) inode_forget (inode, 0); @@ -4644,7 +4648,7 @@ server3_3_unlink (rpcsvc_request_t *req) goto out; } - state->resolve.type = RESOLVE_MUST; + state->resolve.type = RESOLVE_MAY; state->resolve.bname = gf_strdup (args.bname); memcpy (state->resolve.pargfid, args.pargfid, 16); @@ -5642,7 +5646,7 @@ server3_3_rmdir (rpcsvc_request_t *req) goto out; } - state->resolve.type = RESOLVE_MUST; + state->resolve.type = RESOLVE_MAY; memcpy (state->resolve.pargfid, args.pargfid, 16); state->resolve.bname = gf_strdup (args.bname); diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index 
d8ef5f7b73f..636108affbb 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (!posix_write_ok (this, priv)) { + op_errno = ENOSPC; + goto err; + } + ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index d3f48f859bf..558755af009 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -210,6 +210,12 @@ posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize, goto out; } + if (!inode && path) { + gf_log (this->name, GF_LOG_WARNING, "OOPS: Failed to resolve" + "path (%s), inode is null. Bailing!", path); + goto out; + } + ret = posix_make_ancestral_node (priv_base_path, path, pathsize, head, dir_name, &iabuf, inode, type, xdata); if (*parent != NULL) { diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 76e32a31594..4aa39514486 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -485,18 +485,21 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, "Failed to set dictionary value for %s", key); } - } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) { + } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY) && + filler->loc && filler->loc->inode && + !gf_uuid_is_null (filler->loc->inode->gfid)) { /* As of now, the only consumers of POSIX_ANCESTRY_PATH attempt * fetching it via path-based fops. Hence, leaving it as it is * for now. 
*/ if (!filler->real_path) goto out; + char *path = NULL; ret = posix_get_ancestry (filler->this, filler->loc->inode, NULL, &path, POSIX_ANCESTRY_PATH, &filler->op_errno, xattr_req); - if (ret < 0) { + if (ret < 0 || !path) { goto out; } @@ -856,6 +859,7 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) int ret = 0; ssize_t size = 0; struct stat stat = {0, }; + char *new_uuid = NULL; if (!xattr_req) @@ -864,12 +868,6 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); - if (size == 16) { - ret = 0; - goto verify_handle; - } - ret = dict_get_ptr (xattr_req, "gfid-req", &uuid_req); if (ret) { gf_msg_debug (this->name, 0, @@ -878,7 +876,28 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) goto out; } - ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE); + size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { + if (!gf_uuid_compare (uuid_curr, uuid_req)) { + ret = 0; + goto verify_handle; + } + + /* File has an existing GFID which differs from + * the requested one. This can occur when a subvolume + * has been offline while a file is deleted, and then + * comes back up but has not yet healed. Get rid of + * the old GFID link (handle_unset) and fall through + * to the set case below. 
+                 */
+                new_uuid = strdupa (uuid_utoa (uuid_req));
+                gf_log (this->name, GF_LOG_WARNING,
+                        "%s: existing gfid %s overwritten with %s.",
+                        path, uuid_utoa (uuid_curr), new_uuid);
+                posix_handle_unset (this, uuid_curr, NULL);
+        }
+
         ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, 0);
         if (ret == -1) {
                 gf_msg (this->name, GF_LOG_WARNING, errno, P_MSG_GFID_FAILED,
                         "setting GFID on %s failed ", path);
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index dfb7e05e49a..e56e71e8c27 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -183,8 +183,15 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
         op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless);
         op_ret = -1;
         if (gf_uuid_is_null (loc->pargfid) || (loc->name == NULL)) {
-                /* nameless lookup */
-                MAKE_INODE_HANDLE (real_path, this, loc, &buf);
+                if (gf_uuid_is_null (loc->gfid)) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "OOPS: Nameless lookup with null gfid!");
+                        op_errno = EINVAL;
+                        op_ret = -1;
+                        goto out;
+                } else {
+                        MAKE_INODE_HANDLE (real_path, this, loc, &buf);
+                }
         } else {
                 MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf);
 
@@ -220,7 +227,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
         }
 
         if (priv->update_pgfid_nlinks) {
-                if (!gf_uuid_is_null (loc->pargfid) && !IA_ISDIR (buf.ia_type)) {
+                if (!gf_uuid_is_null (loc->pargfid)) {
                         MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
                                               PGFID_XATTR_KEY_PREFIX,
                                               loc->pargfid);
@@ -691,6 +698,109 @@ out:
         return 0;
 }
 
+/* Decide whether the brick still has enough free space for writes.
+ *
+ * @stats:         result of a successful statvfs() on the brick
+ * @min_free_disk: values below 100 are a percentage of total blocks,
+ *                 anything else is an absolute number of bytes
+ * @previously_ok: last verdict; a log line is emitted only on a change
+ *
+ * Returns _gf_true while free space is at or above the limit.
+ */
+static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats,
+                                  double min_free_disk,
+                                  gf_boolean_t previously_ok)
+{
+        gf_boolean_t currently_ok;
+
+        if (min_free_disk < 100.0) {
+                /* A filesystem reporting zero blocks counts as 0% free
+                 * instead of dividing by zero. */
+                double free_percent = stats->f_blocks ?
+                        100.0 * stats->f_bavail / stats->f_blocks : 0.0;
+
+                currently_ok =
+                        free_percent >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free percent "
+                                "%f%% < %f%%. Writes disabled.",
+                                free_percent, min_free_disk);
+                }
+        } else {
+                /* Multiply in floating point so the product cannot wrap
+                 * on platforms where the statvfs fields are 32 bits. */
+                double free_bytes =
+                        (double) stats->f_bavail * stats->f_frsize;
+
+                currently_ok =
+                        free_bytes >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free bytes %f "
+                                "< %f. Writes disabled.",
+                                free_bytes, min_free_disk);
+                }
+        }
+
+        if (currently_ok && !previously_ok) {
+                gf_log (this->name, GF_LOG_INFO, "Free space has risen above "
+                                                 "min-free-disk limit, writes "
+                                                 "re-enabled.");
+        }
+
+        return currently_ok;
+}
+
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv)
+{
+        /* Check if there is sufficient free space to allow writes.
+         *
+         * This is called in the write path, so performance matters.  We
+         * periodically sample free space by calling statvfs().
+         * freespace_check_lock is used to ensure only one thread at a
+         * time makes the call; if the lock is contended, the previous
+         * status (reflected in freespace_check_passed) is used while
+         * the thread that holds the mutex updates the current status.
+         *
+         * Returns _gf_true when writes may proceed; always _gf_true when
+         * freespace_check_interval is 0 (check disabled).
+         */
+        if (!priv->freespace_check_interval) {
+                return _gf_true;
+        }
+
+        if (!pthread_mutex_trylock (&priv->freespace_check_lock)) {
+                struct timespec now;
+
+                clock_gettime (CLOCK_MONOTONIC, &now);
+                if (now.tv_sec >= priv->freespace_check_last.tv_sec +
+                                          priv->freespace_check_interval) {
+                        priv->freespace_check_last.tv_sec = now.tv_sec;
+
+                        if (sys_statvfs (priv->base_path,
+                                         &priv->freespace_stats) == 0) {
+                                priv->freespace_check_passed = freespace_ok (
+                                        this, &priv->freespace_stats,
+                                        priv->min_free_disk,
+                                        priv->freespace_check_passed);
+                        } else {
+                                /* Keep the previous verdict rather than
+                                 * judging stale statistics. */
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "statvfs on %s failed (%s); keeping "
+                                        "previous free space status.",
+                                        priv->base_path, strerror (errno));
+                        }
+                }
+
+                pthread_mutex_unlock (&priv->freespace_check_lock);
+        }
+
+        return priv->freespace_check_passed;
+}
+
 static int32_t
 posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
                     int32_t flags, off_t offset, size_t len,
@@ -700,6 +810,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
         int32_t             op_errno = 0;
         struct posix_fd    *pfd    = NULL;
         gf_boolean_t        locked = _gf_false;
+        struct posix_private *priv = NULL;
         posix_inode_ctx_t  *ctx    = NULL;
 
         DECLARE_OLD_FS_ID_VAR;
@@ -709,6 +820,13 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
         VALIDATE_OR_GOTO (frame, out);
         VALIDATE_OR_GOTO (this, out);
         VALIDATE_OR_GOTO (fd, out);
+        priv = this->private; /* only safe once 'this' is validated */
+        VALIDATE_OR_GOTO (priv, out);
+
+        if (!posix_write_ok (this, priv)) {
+                ret = -ENOSPC;
+                goto out;
+        }
 
         ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
         if (ret < 0) {
@@ -2514,8 +2632,7 @@ posix_rename (call_frame_t *frame, xlator_t *this,
 
         pthread_mutex_lock (&ctx_old->pgfid_lock);
         {
-                if (!IA_ISDIR (oldloc->inode->ia_type)
-                    && priv->update_pgfid_nlinks) {
+                if (priv->update_pgfid_nlinks) {
                         MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
                                               PGFID_XATTR_KEY_PREFIX,
                                               oldloc->pargfid);
@@ -2581,8 +2698,7 @@ posix_rename (call_frame_t *frame, xlator_t *this,
                                 P_MSG_SET_XDATA_FAIL, "failed to set "
                                 GET_LINK_COUNT" for %s", real_newpath);
 
-        if (!IA_ISDIR (oldloc->inode->ia_type)
-            && priv->update_pgfid_nlinks) {
+        if (priv->update_pgfid_nlinks) {
                 MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
                                       PGFID_XATTR_KEY_PREFIX,
                                       newloc->pargfid);
@@ -3386,6 +3502,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
 
         VALIDATE_OR_GOTO (priv, out);
 
+        if (!posix_write_ok (this, priv)) {
+                op_errno = ENOSPC;
+                op_ret = -1;
+                goto out;
+        }
+
         ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
         if (ret < 0) {
                 gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL,
@@ -4335,6 +4457,12 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode,
                                                   op_errno, xdata);
         }
 
+        if (ret == 0 && path && !*path) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Failed to resolve ancestry path, pgfid "
+                        "attribute isn't set (yet).");
+                ret = -1;
+        }
 out:
         if (ret && path && *path) {
                 GF_FREE (*path);
@@ -4555,7 +4683,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this,
                 goto done;
         }
 
-        if (loc->inode && name
+        if (loc->inode && !gf_uuid_is_null(loc->inode->gfid) && name
             && (strcmp (name, GET_ANCESTRY_PATH_KEY) == 0)) {
                 int type = POSIX_ANCESTRY_PATH;
 
@@ -6761,6 +6889,26 @@ struct posix_private *priv = NULL;
                           options, uint32, out);
         posix_spawn_health_check_thread (this);
 
+        {
+                /* Parse into temporaries first: GF_OPTION_RECONF jumps to
+                 * its error label ('out') on failure, and that must never
+                 * happen while freespace_check_lock is held. */
+                uint32_t check_interval = 0;
+                double   min_free = 0.0;
+
+                GF_OPTION_RECONF ("freespace-check-interval", check_interval,
+                                  options, uint32, out);
+                GF_OPTION_RECONF ("min-free-disk", min_free, options,
+                                  percent_or_size, out);
+
+                pthread_mutex_lock (&priv->freespace_check_lock);
+                {
+                        priv->freespace_check_interval = check_interval;
+                        priv->min_free_disk = min_free;
+                }
+                pthread_mutex_unlock (&priv->freespace_check_lock);
+        }
+
         ret = 0;
 out:
         return ret;
@@ -7375,6 +7523,29 @@ init (xlator_t *this)
 
         GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
                         uint32, out);
+
+        GF_OPTION_INIT ("freespace-check-interval",
+                        _private->freespace_check_interval, uint32, out);
+
+        GF_OPTION_INIT ("min-free-disk", _private->min_free_disk,
+                        percent_or_size, out);
+
+        pthread_mutex_init (&_private->freespace_check_lock, NULL);
+        clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last);
+        if (sys_statvfs (_private->base_path,
+                         &_private->freespace_stats) == 0) {
+                _private->freespace_check_passed = freespace_ok (
+                        this, &_private->freespace_stats,
+                        _private->min_free_disk, _gf_true);
+        } else {
+                /* No usable statistics yet: allow writes and let the
+                 * periodic check in posix_write_ok() decide later. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "statvfs on %s failed (%s); free space check "
+                        "deferred.",
+                        _private->base_path, strerror (errno));
+                _private->freespace_check_passed = _gf_true;
+        }
 out:
         return ret;
 }
@@ -7539,7 +7710,7 @@ struct volume_options options[] = {
         },
         { .key = {"update-link-count-parent"},
           .type = GF_OPTION_TYPE_BOOL,
-          .default_value = "off",
+          .default_value = "on",
           .description = "Enable placeholders for gfid to path conversion"
         },
 #if GF_DARWIN_HOST_OS
@@ -7552,5 +7723,22 @@ struct volume_options options[] = {
           "\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n"
         },
 #endif
+        { .key = {"min-free-disk"},
+          .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+          .default_value = "2%",
+          .description = "Minimum percentage/size of disk space, after which we "
+          "start failing writes with ENOSPC."
+        },
+        {
+          .key = {"freespace-check-interval"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 0,
+          .default_value = "5",
+          .validate = GF_OPT_VALIDATE_MIN,
+          .description = "Interval in seconds between freespace measurements "
+          "used for the min-free-disk determination. "
+          "Set to 0 to disable."
+        },
+
         { .key = {NULL} }
 };
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index febd4326aa1..a2e1201dd72 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -174,7 +174,14 @@ struct posix_private {
                 XATTR_BOTH,
         } xattr_user_namespace;
 #endif
-
+        /* freespace_check_lock protects access to following three fields. */
+        pthread_mutex_t freespace_check_lock;
+        struct timespec freespace_check_last;
+        struct statvfs  freespace_stats;
+        double          min_free_disk;
+        /* mutex protection ends. */
+        uint32_t        freespace_check_interval;
+        gf_boolean_t    freespace_check_passed;
 };
 
 typedef struct {
@@ -280,6 +287,9 @@ posix_handle_georep_xattrs (call_frame_t *, const char *, int *, gf_boolean_t);
 void
 posix_gfid_unset (xlator_t *this, dict_t *xdata);
 
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv);
+
 int
 posix_pacl_set (const char *path, const char *key, const char *acl_s);
 
|