diff options
124 files changed, 6720 insertions, 518 deletions
diff --git a/api/src/glfs-mgmt.c b/api/src/glfs-mgmt.c index 8c9872cfa53..5d08114c8c5 100644 --- a/api/src/glfs-mgmt.c +++ b/api/src/glfs-mgmt.c @@ -911,7 +911,8 @@ glfs_mgmt_init (struct glfs *fs)          if (!strcmp (cmd_args->volfile_server_transport, "unix")) {                  ret = rpc_transport_unix_options_build (&options, host, 0);          } else { -                ret = rpc_transport_inet_options_build (&options, host, port); +                ret = rpc_transport_inet_options_build (&options, host, port, +                                                        NULL);          }  	if (ret) diff --git a/cli/src/cli.c b/cli/src/cli.c index 2ecaae415d6..fa507309e80 100644 --- a/cli/src/cli.c +++ b/cli/src/cli.c @@ -586,6 +586,11 @@ cli_rpc_init (struct cli_state *state)          int                     ret = -1;          int                     port = CLI_GLUSTERD_PORT;          xlator_t                *this = NULL; +#ifdef IPV6_DEFAULT +        char                    *addr_family = "inet6"; +#else +        char                    *addr_family = "inet"; +#endif          this = THIS;          cli_rpc_prog = &cli_prog; @@ -621,7 +626,8 @@ cli_rpc_init (struct cli_state *state)                          goto out;                  ret = dict_set_str (options, "transport.address-family", -                                    "inet"); +                                        addr_family); +                  if (ret)                          goto out;          } diff --git a/configure.ac b/configure.ac index 5a7231262d9..4c2bb32fe23 100644 --- a/configure.ac +++ b/configure.ac @@ -72,6 +72,8 @@ AC_CONFIG_FILES([Makefile                  xlators/cluster/Makefile                  xlators/cluster/afr/Makefile                  xlators/cluster/afr/src/Makefile +                xlators/cluster/aha/Makefile +                xlators/cluster/aha/src/Makefile                  xlators/cluster/stripe/Makefile                  xlators/cluster/stripe/src/Makefile                  
xlators/cluster/dht/Makefile @@ -275,7 +277,19 @@ if test "x$enable_debug" = "xyes"; then          CFLAGS="${CFLAGS} -g -O0 -DDEBUG"  else          BUILD_DEBUG=no -        CFLAGS="${CFLAGS} -g -O2" +        CFLAGS="${CFLAGS} -g" +fi + +AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.])) +if test "x$with_fbextras" = "xyes"; then +        BUILD_FBEXTRAS=yes +else +        BUILD_FBEXTRAS=no +fi + +AC_ARG_ENABLE([privport_prefer], AC_HELP_STRING([--disable-privport_prefer], [Disable preferred usage of privleged ports.])) +if test "x$enable_privport_prefer" = "xno"; then +    CFLAGS="${CFLAGS} -DNO_PRIVPORT"  fi  case $host_os in @@ -908,6 +922,16 @@ AC_SUBST(GF_DISTRIBUTION)  GF_HOST_OS=""  GF_LDFLAGS="-rdynamic" +TESTER_CFLAGS="" + +dnl include tirpc for FB builds +if test "x$BUILD_FBEXTRAS" = "xyes"; then +        TIRPC_CFLAGS="-I/usr/include/tirpc" +        GF_LDFLAGS="-lfbtirpc $GF_LDFLAGS" +        GF_CFLAGS="$GF_CFLAGS $TIRPC_CFLAGS -DIPV6_DEFAULT -DGF_FBEXTRAS" +        TESTER_CFLAGS="$TESTER_CFLAGS -lfbtirpc" +fi +  dnl check for gcc -Werror=format-security  saved_CFLAGS=$CFLAGS  CFLAGS="-Wformat -Werror=format-security" @@ -1099,6 +1123,12 @@ AC_ARG_ENABLE([debug],                AC_HELP_STRING([--enable-debug],                               [Enable debug build options.])) +AC_ARG_ENABLE([mempool], +              AC_HELP_STRING([--disable-mempool], +                             [Disable the Gluster memory pooler.])) +if test "x$enable_mempool" = "xno"; then +        CFLAGS="${CFLAGS} -DDISABLE_MEMPOOL" +fi  # syslog section  AC_ARG_ENABLE([syslog], @@ -1294,12 +1324,14 @@ AC_SUBST([GF_CPPFLAGS])  AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS")  AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS")  AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS") +AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes")  AC_SUBST(GLUSTERD_WORKDIR)  
AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! -d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd )  AC_SUBST(GLUSTERD_VOLFILE)  AC_SUBST(GLUSTERFS_LIBEXECDIR)  AC_SUBST(GLUSTERFSD_MISCDIR) +AC_SUBST(TESTER_CFLAGS)  dnl pkg-config versioning  dnl diff --git a/glusterfs.spec.in b/glusterfs.spec.in index 66c9a46a2be..29d07c530c0 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -13,6 +13,10 @@  # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with debug  %{?_with_debug:%global _with_debug --enable-debug} +# if you wish to compile an rpm with Facebook specific extras... +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras +%{?_with_fbextras:%global _with_fbextras --with-fbextras} +  # if you wish to compile an rpm with cmocka unit testing...  # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka  %{?_with_cmocka:%global _with_cmocka --enable-cmocka} @@ -196,6 +200,9 @@ BuildRequires:    libxml2-devel openssl-devel  BuildRequires:    libaio-devel libacl-devel  BuildRequires:    python-devel  BuildRequires:    python-ctypes +%if ( 0%{?_with_fbextras:1} ) +BuildRequires:    fb-libtirpc fb-libtirpc-devel +%endif  BuildRequires:    userspace-rcu-devel >= 0.7  %if ( 0%{?rhel} && 0%{?rhel} <= 6 )  BuildRequires:    automake @@ -513,6 +520,9 @@ Requires:         %{name}-cli%{?_isa} = %{version}-%{release}  Requires:         %{name}-libs%{?_isa} = %{version}-%{release}  # some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse  Requires:         %{name}-fuse%{?_isa} = %{version}-%{release} +%if ( 0%{?_with_fbextras:1} ) +Requires:         fb-libtirpc >= 0.2.5-1 +%endif  # self-heal daemon, rebalance, nfs-server etc. 
are actually clients  Requires:         %{name}-api%{?_isa} = %{version}-%{release}  Requires:         %{name}-client-xlators%{?_isa} = %{version}-%{release} @@ -600,7 +610,8 @@ export CFLAGS          %{?_without_ocf} \          %{?_without_rdma} \          %{?_without_syslog} \ -        %{?_without_tiering} +        %{?_without_tiering} \ +        %{?_with_fbextras}  # fix hardening and remove rpath in shlibs  %if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index c47fa3883c9..a7c96d1e7a0 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -1903,9 +1903,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,          switch (event) {          case RPC_CLNT_DISCONNECT: -                GF_LOG_OCCASIONALLY (log_ctr1, "glusterfsd-mgmt", GF_LOG_ERROR, -                        "failed to connect with remote-host: %s (%s)", -                        ctx->cmd_args.volfile_server, strerror (errno)); +                ctx->cmd_args.connect_attempts++; + +                gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, +                        "Connect attempt with remote-host: %s (%s) (%u/%d)", +                                ctx->cmd_args.volfile_server, +                                strerror (errno), +                                ctx->cmd_args.connect_attempts, +                                ctx->cmd_args.max_connect_attempts);                  if (!rpc->disabled) {                          /*                           * Check if dnscache is exhausted for current server @@ -1916,8 +1921,14 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,                                  break;                          }                  } + +               /* If we run out of servers, AND we attempted to connect +                * max connect times, then we should return ENOTCONN +            
    */                  server = ctx->cmd_args.curr_server; -                if (server->list.next == &ctx->cmd_args.volfile_servers) { +                if ((ctx->cmd_args.connect_attempts >= +                     ctx->cmd_args.max_connect_attempts) && +                     server->list.next == &ctx->cmd_args.volfile_servers) {                          if (!ctx->active)                                  need_term = 1;                          emval = ENOTCONN; @@ -1926,24 +1937,33 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,                                               "Exhausted all volfile servers");                          break;                  } -                server = list_entry (server->list.next, typeof(*server), list); -                ctx->cmd_args.curr_server = server; -                ctx->cmd_args.volfile_server = server->volfile_server; - -                ret = dict_set_str (rpc_trans->options, "remote-host", -                                    server->volfile_server); -                if (ret != 0) { -                        gf_log ("glusterfsd-mgmt", GF_LOG_ERROR, -                                "failed to set remote-host: %s", + +                /* If we exceed the # of connect attempts, we should +                 * move onto the next server +                 */ +                if (ctx->cmd_args.connect_attempts >= +                    ctx->cmd_args.max_connect_attempts || !server) { +                        server = list_entry (server->list.next, +                                                typeof(*server), list); +                        ctx->cmd_args.curr_server = server; +                        ctx->cmd_args.volfile_server = server->volfile_server; + +                        ret = dict_set_str (rpc_trans->options, "remote-host", +                                                server->volfile_server); +                        if (ret != 0) { +                                gf_log 
("glusterfsd-mgmt", GF_LOG_ERROR, +                                        "failed to set remote-host: %s", +                                        server->volfile_server); +                                if (!ctx->active) +                                        need_term = 1; +                                emval = ENOTCONN; +                                break; +                        } +                        ctx->cmd_args.connect_attempts = 0; +                        gf_log ("glusterfsd-mgmt", GF_LOG_INFO, +                                "connecting to next volfile server %s",                                  server->volfile_server); -                        if (!ctx->active) -                                need_term = 1; -                        emval = ENOTCONN; -                        break;                  } -                gf_log ("glusterfsd-mgmt", GF_LOG_INFO, -                        "connecting to next volfile server %s", -                        server->volfile_server);                  break;          case RPC_CLNT_CONNECT:                  rpc_clnt_set_connected (&((struct rpc_clnt*)ctx->mgmt)->conn); @@ -1960,7 +1980,7 @@ mgmt_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,                          }                  } - +                ctx->cmd_args.connect_attempts = 0;                  if (is_mgmt_rpc_reconnect)                          glusterfs_mgmt_pmap_signin (ctx); @@ -2120,6 +2140,7 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx)          int                     ret = -1;          int                     port = GF_DEFAULT_BASE_PORT;          char                    *host = NULL; +        char                    *addr_family = NULL;          cmd_args = &ctx->cmd_args;          GF_VALIDATE_OR_GOTO (THIS->name, cmd_args->volfile_server, out); @@ -2136,7 +2157,19 @@ glusterfs_mgmt_init (glusterfs_ctx_t *ctx)              !strcmp (cmd_args->volfile_server_transport, "unix")) {                  ret = 
rpc_transport_unix_options_build (&options, host, 0);          } else { -                ret = rpc_transport_inet_options_build (&options, host, port); +                xlator_cmdline_option_t *cmd_option = NULL; + +                list_for_each_entry (cmd_option, +                                     &cmd_args->xlator_options, cmd_args) { +                        if (!strcmp(cmd_option->key, +                                    "transport.address-family")) { +                                addr_family = cmd_option->value; +                                break; +                         } +                } + +                ret = rpc_transport_inet_options_build (&options, host, port, +                                                        addr_family);          }          if (ret)                  goto out; diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c index 6c7a7c883fa..5022cfc22da 100644 --- a/glusterfsd/src/glusterfsd.c +++ b/glusterfsd/src/glusterfsd.c @@ -986,7 +986,7 @@ parse_opts (int key, char *arg, struct argp_state *state)                  cmd_args->debug_mode = ENABLE_DEBUG_MODE;                  break;          case ARGP_VOLFILE_MAX_FETCH_ATTEMPTS: -                cmd_args->max_connect_attempts = 1; +                cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;                  break;          case ARGP_DIRECT_IO_MODE_KEY: @@ -1955,13 +1955,7 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)                  }          } -        /* -           This option was made obsolete but parsing it for backward -           compatibility with third party applications -         */ -        if (cmd_args->max_connect_attempts) { -                gf_msg ("glusterfs", GF_LOG_WARNING, 0, glusterfsd_msg_33); -        } +        cmd_args->max_connect_attempts = DEFAULT_MAX_CONNECT_ATTEMPTS;  #ifdef GF_DARWIN_HOST_OS          if (cmd_args->mount_point) diff --git a/glusterfsd/src/glusterfsd.h 
b/glusterfsd/src/glusterfsd.h index e442bede5db..b5c6b27b534 100644 --- a/glusterfsd/src/glusterfsd.h +++ b/glusterfsd/src/glusterfsd.h @@ -16,7 +16,7 @@  #define DEFAULT_GLUSTERD_VOLFILE              CONFDIR "/glusterd.vol"  #define DEFAULT_CLIENT_VOLFILE                CONFDIR "/glusterfs.vol"  #define DEFAULT_SERVER_VOLFILE                CONFDIR "/glusterfsd.vol" - +#define DEFAULT_MAX_CONNECT_ATTEMPTS       200  #define DEFAULT_EVENT_POOL_SIZE            16384  #define ARGP_LOG_LEVEL_NONE_OPTION        "NONE" diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 18f445ae265..6a5889207d4 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -181,26 +181,16 @@ gf_rev_dns_lookup (const char *ip)  {          char               *fqdn = NULL;          int                ret  = 0; -        struct sockaddr_in sa   = {0}; -        char               host_addr[256] = {0, };          GF_VALIDATE_OR_GOTO ("resolver", ip, out); -        sa.sin_family = AF_INET; -        inet_pton (AF_INET, ip, &sa.sin_addr); -        ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr, -                          sizeof (host_addr), NULL, 0, 0); - +        /* Get the FQDN */ +        ret =  gf_get_hostname_from_ip ((char *)ip, &fqdn);          if (ret != 0) {                  gf_msg ("resolver", GF_LOG_INFO, errno,                          LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve "                          "hostname for %s", ip); -                goto out;          } - -        /* Get the FQDN */ -        fqdn = gf_strdup (host_addr); -  out:         return fqdn;  } @@ -3107,11 +3097,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)          char                    *client_ip_copy               = NULL;          char                    *tmp                          = NULL;          char                    *ip                           = NULL; +        size_t                   addr_sz         
             = 0;          /* if ipv4, reverse lookup the hostname to           * allow FQDN based rpc authentication           */ -        if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) { +        if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) && +            !valid_ipv4_address (client_ip, strlen (client_ip), 0)) {                  /* most times, we get a.b.c.d:port form, so check that */                  client_ip_copy = gf_strdup (client_ip);                  if (!client_ip_copy) @@ -3124,12 +3116,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)          if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) {                  client_sockaddr = (struct sockaddr *)&client_sock_in; +                addr_sz = sizeof (client_sock_in);                  client_sock_in.sin_family = AF_INET;                  ret = inet_pton (AF_INET, ip,                                   (void *)&client_sock_in.sin_addr.s_addr);          } else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) {                  client_sockaddr = (struct sockaddr *) &client_sock_in6; +                addr_sz = sizeof (client_sock_in6);                  client_sock_in6.sin6_family = AF_INET6;                  ret = inet_pton (AF_INET6, ip, @@ -3143,8 +3137,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname)                  goto out;          } +        /* You cannot just use sizeof (*client_sockaddr), as per the man page +         * the (getnameinfo) size must be the size of the underlying sockaddr +         * struct e.g. sockaddr_in6 or sockaddr_in.  Failure to do so will +         * break IPv6 hostname resolution (IPv4 will work only because +         * the sockaddr_in struct happens to be of the correct size). 
+         */          ret = getnameinfo (client_sockaddr, -                           sizeof (*client_sockaddr), +                           addr_sz,                             client_hostname, sizeof (client_hostname),                             NULL, 0, 0);          if (ret) { diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h index fbaac76b9ee..771ed983d32 100644 --- a/libglusterfs/src/compat.h +++ b/libglusterfs/src/compat.h @@ -479,6 +479,12 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);  #define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0);  #endif +#ifdef GF_BSD_HOST_OS +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME +#endif + +#ifndef IPV6_DEFAULT +  #ifndef IXDR_GET_LONG  #define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf))  #endif @@ -495,6 +501,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags);  #define IXDR_PUT_U_LONG(buf, v)       IXDR_PUT_LONG(buf, (long)(v))  #endif +#endif /* IPV6_DEFAULT */ +  #if defined(__GNUC__) && !defined(RELAX_POISONING)  /* Use run API, see run.h */  #include <stdlib.h> /* system(), mkostemp() */ diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c index 25ddff0d8c4..6a61e641e19 100644 --- a/libglusterfs/src/dict.c +++ b/libglusterfs/src/dict.c @@ -27,6 +27,45 @@  #include "statedump.h"  #include "libglusterfs-messages.h" +/* this goes with the bucket_size lookup table below */ +#define NUM_DISTINCT_SIZES_32_BIT 32 + +/* this bucket_size lookup table is borrowed from GNU libstdc++ */ +static const uint32_t bucket_sizes[NUM_DISTINCT_SIZES_32_BIT] = { +  /* 0  */ 5ul, +  /* 1  */ 11ul, +  /* 2  */ 23ul, +  /* 3  */ 47ul, +  /* 4  */ 97ul, +  /* 5  */ 199ul, +  /* 6  */ 409ul, +  /* 7  */ 823ul, +  /* 8  */ 1741ul, +  /* 9  */ 3469ul, +  /* 10 */ 6949ul, +  /* 11 */ 14033ul, +  /* 12 */ 28411ul, +  /* 13 */ 57557ul, +  /* 14 */ 116731ul, +  /* 15 */ 236897ul, +  /* 16 */ 480881ul, +  /* 17 */ 976369ul, +  /* 18 */ 1982627ul, +  /* 19 */ 4026031ul, +  /* 20 */ 8175383ul, +  /* 
21 */ 16601593ul, +  /* 22 */ 33712729ul, +  /* 23 */ 68460391ul, +  /* 24 */ 139022417ul, +  /* 25 */ 282312799ul, +  /* 26 */ 573292817ul, +  /* 27 */ 1164186217ul, +  /* 28 */ 2364114217ul, +  /* 29 */ 4294967291ul, +  /* 30 */ 4294967291ul, +  /* 31 */ 4294967291ul, +}; +  struct dict_cmp {          dict_t *dict;          gf_boolean_t (*value_ignore) (char *k); @@ -47,7 +86,7 @@ get_new_data ()  }  dict_t * -get_new_dict_full (int size_hint) +get_new_dict_full (uint32_t size_hint)  {          dict_t *dict = mem_get0 (THIS->ctx->dict_pool); @@ -67,17 +106,8 @@ get_new_dict_full (int size_hint)                  dict->members = &dict->members_internal;          }          else { -                /* -                 * We actually need to allocate space for size_hint *pointers* -                 * but we actually allocate space for one *structure*.  Since -                 * a data_pair_t consists of five pointers, we're wasting four -                 * pointers' worth for N=1, and will overrun what we allocated -                 * for N>5.  If anybody ever starts using size_hint, we'll need -                 * to fix this. 
-                 */ -                GF_ASSERT (size_hint <= -                           (sizeof(data_pair_t) / sizeof(data_pair_t *))); -                dict->members = mem_get0 (THIS->ctx->dict_pair_pool); +                dict->members = GF_CALLOC (size_hint, sizeof (data_pair_t *), +                                           gf_common_mt_data_pair_t);                  if (!dict->members) {                          mem_put (dict);                          return NULL; @@ -108,6 +138,35 @@ dict_new (void)          return dict;  } +dict_t * +dict_new_by_size (uint32_t num) +{ +        int32_t highest_bit = 0; +        uint32_t bucket_size = 0; +        dict_t *dict = NULL; + +        if (num == 0) +                goto out; + +#ifdef _GNU_SOURCE +        highest_bit = 32 - __builtin_clz (num); +#else +        while (num != 0) { +                highest_bit++; +                num >>= 1; +        } +#endif + +        bucket_size = bucket_sizes[highest_bit - 1]; +        dict = get_new_dict_full (bucket_size); + +        if (dict) +                dict_ref (dict); + +out: +        return dict; +} +  int32_t  is_data_equal (data_t *one,                 data_t *two) @@ -268,7 +327,7 @@ err_out:  static data_pair_t *  dict_lookup_common (dict_t *this, char *key)  { -        int hashval = 0; +        uint32_t hashval = 0;          if (!this || !key) {                  gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL,                                    LG_MSG_INVALID_ARG, @@ -279,7 +338,7 @@ dict_lookup_common (dict_t *this, char *key)          /* If the divisor is 1, the modulo is always 0,           * in such case avoid hash calculation.           
*/ -        if (this->hash_size != 1) +        if (this->hash_size > 1)                  hashval = SuperFastHash (key, strlen (key)) % this->hash_size;          data_pair_t *pair; @@ -319,7 +378,7 @@ dict_lookup (dict_t *this, char *key, data_t **data)  static int32_t  dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)  { -        int hashval = 0; +        uint32_t hashval = 0;          data_pair_t *pair;          char key_free = 0;          int tmp = 0; @@ -336,7 +395,7 @@ dict_set_lk (dict_t *this, char *key, data_t *value, gf_boolean_t replace)          /* If the divisor is 1, the modulo is always 0,           * in such case avoid hash calculation.           */ -        if (this->hash_size != 1) { +        if (this->hash_size > 1) {                  tmp = SuperFastHash (key, strlen (key));                  hashval = (tmp % this->hash_size);          } @@ -478,7 +537,7 @@ dict_get (dict_t *this, char *key)  void  dict_del (dict_t *this, char *key)  { -        int hashval = 0; +        uint32_t hashval = 0;          if (!this || !key) {                  gf_msg_callingfn ("dict", GF_LOG_WARNING, EINVAL, @@ -491,7 +550,7 @@ dict_del (dict_t *this, char *key)          /* If the divisor is 1, the modulo is always 0,           * in such case avoid hash calculation.           
*/ -        if (this->hash_size != 1) +        if (this->hash_size > 1)                  hashval = SuperFastHash (key, strlen (key)) % this->hash_size;          data_pair_t *pair = this->members[hashval]; diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h index c5b82677e2e..1f6c1a0eae9 100644 --- a/libglusterfs/src/dict.h +++ b/libglusterfs/src/dict.h @@ -79,9 +79,9 @@ struct _data_pair {  struct _dict {          unsigned char   is_static:1; -        int32_t         hash_size; -        int32_t         count; -        int32_t         refcount; +        uint32_t         hash_size; +        uint32_t         count; +        uint32_t         refcount;          data_pair_t   **members;          data_pair_t    *members_list;          char           *extra_free; @@ -156,7 +156,7 @@ void *data_to_ptr (data_t *data);  data_t *get_new_data ();  data_t * data_copy (data_t *old); -dict_t *get_new_dict_full (int size_hint); +dict_t *get_new_dict_full (uint32_t size_hint);  dict_t *get_new_dict ();  int dict_foreach (dict_t *this, @@ -196,6 +196,7 @@ int dict_keys_join (void *value, int size, dict_t *dict,  /* CLEANED UP FUNCTIONS DECLARATIONS */  GF_MUST_CHECK dict_t *dict_new (void); +GF_MUST_CHECK dict_t *dict_new_by_size (uint32_t num);  dict_t *dict_copy_with_ref (dict_t *this, dict_t *new);  GF_MUST_CHECK int dict_reset (dict_t *dict); diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 6e2d370605b..399d695665b 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -330,6 +330,7 @@ struct _cmd_args {          uint32_t         log_buf_size;          uint32_t         log_flush_timeout;          int32_t          max_connect_attempts; +        unsigned int     connect_attempts;          char            *print_exports;          char            *print_netgroups;          /* advanced options */ diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c index 17cd68fc206..fa3ac840c43 100644 --- 
a/libglusterfs/src/iobuf.c +++ b/libglusterfs/src/iobuf.c @@ -30,8 +30,8 @@ struct iobuf_init_config gf_iobuf_init_config[] = {          {8 * 1024, 128},          {32 * 1024, 64},          {128 * 1024, 32}, -        {256 * 1024, 8}, -        {1 * 1024 * 1024, 2}, +        {256 * 1024, 64}, +        {1 * 1024 * 1024, 64},  };  int diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c index 611615949fa..3399cc7c297 100644 --- a/libglusterfs/src/latency.c +++ b/libglusterfs/src/latency.c @@ -21,6 +21,7 @@  #include "statedump.h"  #include "libglusterfs-messages.h" +static int gf_set_fop_from_fn_pointer_warning;  void  gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn)  { @@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void                  fop = GF_FOP_READDIRP;          else if (fops->getspec == *(fop_getspec_t *)&fn)                  fop = GF_FOP_GETSPEC; -        else -                fop = -1; +        else if (fops->ipc == *(fop_ipc_t *)&fn) +                fop = GF_FOP_IPC; +        else { +                fop = GF_FOP_NULL; +                GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning, +                                    "latency", +                                    GF_LOG_WARNING, +                                    "Unknown FOP type"); +        }          frame->op   = fop;  } diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c index 88fbdf58319..4d81ade8b60 100644 --- a/libglusterfs/src/mem-pool.c +++ b/libglusterfs/src/mem-pool.c @@ -454,6 +454,10 @@ mem_get0 (struct mem_pool *mem_pool)  void *  mem_get (struct mem_pool *mem_pool)  { +#ifdef DISABLE_MEMPOOL +          return GF_CALLOC (1, mem_pool->real_sizeof_type, +                                gf_common_mt_mem_pool); +#else          struct list_head *list = NULL;          void             *ptr = NULL;          int             *in_use = NULL; @@ -525,6 +529,7 @@ 
fwd_addr_out:          UNLOCK (&mem_pool->lock);          return ptr; +#endif /* DISABLE_MEMPOOL */  } @@ -551,6 +556,10 @@ __is_member (struct mem_pool *pool, void *ptr)  void  mem_put (void *ptr)  { +#ifdef DISABLE_MEMPOOL +        GF_FREE (ptr); +        return; +#else          struct list_head *list = NULL;          int    *in_use = NULL;          void   *head = NULL; @@ -628,6 +637,7 @@ mem_put (void *ptr)                  }          }          UNLOCK (&pool->lock); +#endif /* DISABLE_MEMPOOL */  }  void diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index afa52d8bc45..fc7bf9e5996 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -168,6 +168,7 @@ enum gf_common_mem_types_ {          /*lock migration*/          gf_common_mt_lock_mig,          gf_common_mt_pthread_t, +        gf_common_ping_local_t,          gf_common_mt_end  };  #endif diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c index f7b2bea2f30..903303d1380 100644 --- a/libglusterfs/src/timespec.c +++ b/libglusterfs/src/timespec.c @@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta)          ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000);          ts->tv_sec += delta.tv_sec;  } + +void timespec_sub (const struct timespec *begin, const struct timespec *end, +                   struct timespec *res) +{ +        if (end->tv_nsec < begin->tv_nsec) { +                res->tv_sec = end->tv_sec - begin->tv_sec - 1; +                res->tv_nsec = end->tv_nsec + 1000000000 - begin->tv_nsec; +        } else { +                res->tv_sec = end->tv_sec - begin->tv_sec; +                res->tv_nsec = end->tv_nsec - begin->tv_nsec; +        } +} diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h index f37194b97cf..9c393ee7166 100644 --- a/libglusterfs/src/timespec.h +++ b/libglusterfs/src/timespec.h @@ -20,5 +20,8 @@  void timespec_now (struct timespec *ts);  void 
timespec_adjust_delta (struct timespec *ts, struct timespec delta); +void timespec_sub (const struct timespec *begin, +                   const struct timespec *end, +                   struct timespec *res);  #endif /*  __INCLUDE_TIMESPEC_H__ */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 3c1cde50fa0..b2529d3c4f7 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -117,6 +117,14 @@ out:  } +static const char *xlator_lib_path (void) +{ +        const char *libdir_env = getenv ("GLUSTER_LIBDIR"); + +        return libdir_env ? libdir_env : XLATORDIR; +} + +  int  xlator_volopt_dynload (char *xlator_type, void **dl_handle,                         volume_opt_list_t *opt_list) @@ -130,9 +138,11 @@ xlator_volopt_dynload (char *xlator_type, void **dl_handle,          /* socket.so doesn't fall under the default xlator directory, hence we           * need this check */          if (!strstr(xlator_type, "rpc-transport")) -                ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xlator_type); +                ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), +                                   xlator_type);          else -                ret = gf_asprintf (&name, "%s/%s.so", XLATORPARENTDIR, xlator_type); +                ret = gf_asprintf (&name, "%s/../%s.so", xlator_lib_path (), +                                   xlator_type);          if (-1 == ret) {                  goto out;          } @@ -183,7 +193,7 @@ xlator_dynload (xlator_t *xl)          INIT_LIST_HEAD (&xl->volume_options); -        ret = gf_asprintf (&name, "%s/%s.so", XLATORDIR, xl->type); +        ret = gf_asprintf (&name, "%s/%s.so", xlator_lib_path (), xl->type);          if (-1 == ret) {                  goto out;          } diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index 70e6f0a108d..2e04893c487 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -927,6 +927,7 @@ struct _xlator {          
gf_loglevel_t    loglevel;   /* Log level for translator */ +        fop_latency_t client_latency;          /* for latency measurement */          fop_latency_t latencies[GF_FOP_MAXVALUE]; @@ -17,7 +17,7 @@ done  shift $((OPTIND-1)) -branch="release-3.8"; +branch="release-3.8-fb";  set_hooks_commit_msg()  { diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index a7ff866ac99..7ce066dec5f 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -18,6 +18,7 @@  #include "mem-pool.h"  #include "xdr-rpc.h"  #include "rpc-common-xdr.h" +#include "timespec.h"  char *clnt_ping_procs[GF_DUMP_MAXVALUE] = { @@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = {          .procnames = clnt_ping_procs,  }; +struct ping_local { +    struct rpc_clnt *rpc; +    struct timespec submit_time; +}; +  /* Must be called under conn->lock */  static int  __rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk) @@ -166,16 +172,48 @@ out:          return;  } +void +_update_client_latency (const rpc_clnt_connection_t *conn, +                        call_frame_t *frame, +                        uint64_t elapsed_usec) +{ +        fop_latency_t *lat; + +        lat = &frame->this->client_latency; + +        if (elapsed_usec < lat->min) { +                lat->min = elapsed_usec; +        } + +        if (elapsed_usec > lat->max) { +                lat->max = elapsed_usec; +        } + +        lat->total += elapsed_usec; +        lat->count++; +        lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count; +        gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, " +                "avg: %0.6lf ms, count:%ld", +                conn->trans->peerinfo.identifier, elapsed_usec / 1000.0, +                lat->mean / 1000.0, lat->count); +} +  int  rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,                     void *myframe)  { -        struct rpc_clnt       *rpc     = 
NULL; +        struct ping_local     *local   = NULL;          xlator_t              *this    = NULL;          rpc_clnt_connection_t *conn    = NULL; +          call_frame_t          *frame   = NULL;          struct timespec       timeout  = {0, }; +        struct timespec       now; +        struct timespec       delta; +        int64_t               latency_usec = 0; +        int                   ret = 0;          int                   unref    = 0; +        gf_boolean_t          call_notify = _gf_false;          if (!myframe) {                  gf_log (THIS->name, GF_LOG_WARNING, @@ -185,14 +223,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,          frame = myframe;          this = frame->this; -        rpc  = frame->local; -        frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */ -        conn = &rpc->conn; +        local = frame->local; +        conn = &local->rpc->conn;          pthread_mutex_lock (&conn->lock);          {                  if (req->rpc_status == -1) { -                        unref = rpc_clnt_remove_ping_timer_locked (rpc); +                        unref = rpc_clnt_remove_ping_timer_locked (local->rpc);                          if (unref) {                                  gf_log (this->name, GF_LOG_WARNING,                                          "socket or ib related error"); @@ -207,8 +244,15 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,                          goto unlock;                  } -                unref = rpc_clnt_remove_ping_timer_locked (rpc); -                if (__rpc_clnt_rearm_ping_timer (rpc, +                timespec_now (&now); +                timespec_sub (&local->submit_time, &now, &delta); +                latency_usec = delta.tv_sec * 1000000UL + +                               delta.tv_nsec / 1000UL; + +                _update_client_latency (conn, frame, latency_usec); +                call_notify = _gf_true; +                unref = 
rpc_clnt_remove_ping_timer_locked (local->rpc); +                if (__rpc_clnt_rearm_ping_timer (local->rpc,                                                   rpc_clnt_start_ping) == -1) {                          gf_log (this->name, GF_LOG_WARNING,                                  "failed to set the ping timer"); @@ -217,12 +261,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,          }  unlock:          pthread_mutex_unlock (&conn->lock); + +        if (call_notify) { +                ret = local->rpc->notifyfn (local->rpc, this, +                                            RPC_CLNT_PING, NULL); +                if (ret) { +                        gf_log (this->name, GF_LOG_WARNING, +                                "RPC_CLNT_PING notify failed"); +                } +        }  out:          if (unref) -                rpc_clnt_unref (rpc); +                rpc_clnt_unref (local->rpc); -        if (frame) +        if (frame) { +                GF_FREE (frame->local); +                frame->local = NULL;                  STACK_DESTROY (frame->root); +        }          return 0;  } @@ -232,18 +288,27 @@ rpc_clnt_ping (struct rpc_clnt *rpc)          call_frame_t *frame = NULL;          int32_t       ret   = -1;          rpc_clnt_connection_t *conn = NULL; +        struct ping_local *local = NULL;          conn = &rpc->conn; +        local = GF_MALLOC (sizeof(struct ping_local), gf_common_ping_local_t); +        if (!local) +                return ret;          frame = create_frame (THIS, THIS->ctx->pool); -        if (!frame) +        if (!frame) { +                GF_FREE (local);                  return ret; +        } -        frame->local = rpc; +        local->rpc = rpc; +        timespec_now (&local->submit_time); +        frame->local = local;          ret = rpc_clnt_submit (rpc, &clnt_ping_prog,                                 GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0,                                 NULL, 0, NULL, frame, NULL, 
0, NULL, 0, NULL);          if (ret) { +                /* FIXME: should we free the frame here? Methinks so! */                  gf_log (THIS->name, GF_LOG_ERROR,                          "failed to start ping timer");          } diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index 3a5b287cd49..2ccaa56e4cb 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -19,6 +19,7 @@  typedef enum {          RPC_CLNT_CONNECT,          RPC_CLNT_DISCONNECT, +        RPC_CLNT_PING,          RPC_CLNT_MSG,          RPC_CLNT_DESTROY  } rpc_clnt_event_t; diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c index e224dcc022e..5556740ca81 100644 --- a/rpc/rpc-lib/src/rpc-transport.c +++ b/rpc/rpc-lib/src/rpc-transport.c @@ -166,6 +166,19 @@ out: +int rpc_transport_lib_path (char **name, char *type) +{ +        int                ret = -1; +        char               *libdir_env = getenv ("GLUSTER_LIBDIR"); + +        ret = libdir_env == NULL +            ? gf_asprintf (name, "%s/%s.so", RPC_TRANSPORTDIR, type) +            : gf_asprintf (name, "%s/rpc-transport/%s.so", libdir_env, type); +        return ret; +} + + +  rpc_transport_t *  rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)  { @@ -274,7 +287,7 @@ rpc_transport_load (glusterfs_ctx_t *ctx, dict_t *options, char *trans_name)  		goto fail;  	} -	ret = gf_asprintf (&name, "%s/%s.so", RPC_TRANSPORTDIR, type); +        ret = rpc_transport_lib_path (&name, type);          if (-1 == ret) {                  goto fail;          } @@ -652,18 +665,37 @@ out:          return ret;  } +/** @brief build a dictionary containing basic transport options. + * + * @param[out] options: will be set to a newly created dictionary on success. + * @param[in]  hostname: desired target hostname. + * @param[in]  port: desired target port. + * @param[in]  addr_family (optional): desired address family. If NULL, + *             default will be used. 
+ * + * @returns zero on success. + */  int  rpc_transport_inet_options_build (dict_t **options, const char *hostname, -                                  int port) +                                  int port, const char *addr_family)  {          dict_t          *dict = NULL;          char            *host = NULL;          int             ret = -1; +#ifdef IPV6_DEFAULT +        const char      *addr_family_default = "inet6"; +#else +        const char      *addr_family_default = "inet"; +#endif          GF_ASSERT (options);          GF_ASSERT (hostname);          GF_ASSERT (port >= 1024); +        if (!addr_family) { +                addr_family = addr_family_default; +        } +          dict = dict_new ();          if (!dict)                  goto out; @@ -688,6 +720,14 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname,                  goto out;          } +        ret = dict_set_str (dict, "transport.address-family", +                            (char *)addr_family); +        if (ret) { +                gf_log (THIS->name, GF_LOG_WARNING, +                        "failed to set address-family to %s", addr_family); +                goto out; +        } +          ret = dict_set_str (dict, "transport-type", "socket");          if (ret) {                  gf_log (THIS->name, GF_LOG_WARNING, diff --git a/rpc/rpc-lib/src/rpc-transport.h b/rpc/rpc-lib/src/rpc-transport.h index f0add065065..0f555462ea4 100644 --- a/rpc/rpc-lib/src/rpc-transport.h +++ b/rpc/rpc-lib/src/rpc-transport.h @@ -311,5 +311,6 @@ rpc_transport_unix_options_build (dict_t **options, char *filepath,                                    int frame_timeout);  int -rpc_transport_inet_options_build (dict_t **options, const char *hostname, int port); +rpc_transport_inet_options_build (dict_t **options, const char *hostname, +                                  int port, const char *addr_family);  #endif /* __RPC_TRANSPORT_H__ */ diff --git a/rpc/rpc-lib/src/rpcsvc.c 
b/rpc/rpc-lib/src/rpcsvc.c index f07e745a4b3..9dc3bff427c 100644 --- a/rpc/rpc-lib/src/rpcsvc.c +++ b/rpc/rpc-lib/src/rpcsvc.c @@ -37,6 +37,10 @@  #include <stdarg.h>  #include <stdio.h> +#ifdef IPV6_DEFAULT +#include <netconfig.h> +#endif +  #include "xdr-rpcclnt.h"  #include "glusterfs-acl.h" @@ -1363,6 +1367,82 @@ rpcsvc_error_reply (rpcsvc_request_t *req)          return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL);  } +#ifdef IPV6_DEFAULT +int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port) +{ +        const int IP_BUF_LEN = 64; +        char addr_buf[IP_BUF_LEN]; + +        int err = 0; +        bool_t success = 0; +        struct netconfig *nc; +        struct netbuf *nb; + +        if (!newprog) { +                goto out; +        } + +        nc = getnetconfigent ("tcp6"); +        if (!nc) { +                err = -1; +                goto out; +        } + + +        err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff, +                       port & 0xff); +        if (err < 0) { +                err = -1; +                goto out; +        } + +        nb = uaddr2taddr (nc, addr_buf); +        if (!nb) { +                err = -1; +                goto out; +        } + +        success = rpcb_set (newprog->prognum, newprog->progver, nc, nb); +        if (!success) { +                gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6" +                                                 " service with rpcbind"); +        } + +        err = 0; + +out: +        return err; +} + +int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog) +{ +        int err = 0; +        bool_t success = 0; +        struct netconfig *nc; + +        if (!newprog) { +                goto out; +        } + +        nc = getnetconfigent ("tcp6"); +        if (!nc) { +                err = -1; +                goto out; +        } + +        success = rpcb_unset (newprog->prognum, newprog->progver, nc); +        if 
(!success) { +                gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6" +                                                 " service with rpcbind"); +        } + +        err = 0; +out: +        return err; +} +#endif  /* Register the program with the local portmapper service. */  int @@ -1527,7 +1607,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program)                          " program failed");                  goto out;          } - +#ifdef IPV6_DEFAULT +        ret = rpcsvc_program_unregister_rpcbind6 (program); +        if (ret == -1) { +                gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)" +                        " unregistration of program failed"); +                goto out; +        } +#endif          pthread_mutex_lock (&svc->rpclock);          {                  list_for_each_entry (prog, &svc->programs, program) { diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h index 02e467e68a7..1032df03b0e 100644 --- a/rpc/rpc-lib/src/rpcsvc.h +++ b/rpc/rpc-lib/src/rpcsvc.h @@ -437,6 +437,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener);  extern int  rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port); +#ifdef IPV6_DEFAULT +extern int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port); +#endif +  extern int  rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog); diff --git a/rpc/rpc-transport/rdma/src/name.c b/rpc/rpc-transport/rdma/src/name.c index 8003b1c87a0..b9d3269eb73 100644 --- a/rpc/rpc-transport/rdma/src/name.c +++ b/rpc/rpc-transport/rdma/src/name.c @@ -54,6 +54,10 @@ af_inet_bind_to_port_lt_ceiling (struct rdma_cm_id *cm_id,                                   struct sockaddr *sockaddr,                                   socklen_t sockaddr_len, uint32_t ceiling)  { +#if defined(NO_PRIVPORT) +        _assign_port(sockaddr, 0); +        return rdma_bind_addr (cm_id, sockaddr); +#else          int32_t        ret        = 
-1;          uint16_t      port        = ceiling - 1;          gf_boolean_t  ports[GF_PORT_MAX]; @@ -100,6 +104,7 @@ loop:          }          return ret; +#endif /* NO_PRIVPORT */  }  #if 0 diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c index 0e34dc211fe..cab4161c076 100644 --- a/rpc/rpc-transport/socket/src/name.c +++ b/rpc/rpc-transport/socket/src/name.c @@ -42,6 +42,10 @@ static int32_t  af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr,                                   socklen_t sockaddr_len, uint32_t ceiling)  { +#if defined(NO_PRIVPORT) +        _assign_port(sockaddr, 0); +        return bind (fd, sockaddr, sockaddr_len); +#else          int32_t        ret        = -1;          uint16_t      port        = ceiling - 1;          gf_boolean_t  ports[GF_PORT_MAX]; @@ -88,6 +92,7 @@ loop:          }          return ret; +#endif /* NO_PRIVPORT */  }  static int32_t @@ -557,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)          data_t  *address_family_data = NULL;          int32_t  ret                 = -1; +#ifdef IPV6_DEFAULT +        char *addr_family            = "inet6"; +        sa_family_t default_family   = AF_INET6; +#else +        char *addr_family            = "inet"; +        sa_family_t default_family   = AF_INET; +#endif +          GF_VALIDATE_OR_GOTO ("socket", sa_family, out);          address_family_data = dict_get (this->options, @@ -581,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family)                  }          } else {                  gf_log (this->name, GF_LOG_DEBUG, -                        "option address-family not specified, defaulting to inet"); -                *sa_family = AF_INET; +                        "option address-family not specified, " +                        "defaulting to %s", addr_family); +                *sa_family = default_family;          }          ret = 0; diff --git 
a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c index ae551dcfae7..76609fbbc7a 100644 --- a/rpc/rpc-transport/socket/src/socket.c +++ b/rpc/rpc-transport/socket/src/socket.c @@ -55,7 +55,11 @@  /* TBD: do automake substitutions etc. (ick) to set these. */  #if !defined(DEFAULT_ETC_SSL)  #  ifdef GF_LINUX_HOST_OS +#   ifdef GF_FBEXTRAS +#    define DEFAULT_ETC_SSL "/var/lib/glusterd/ssl" +#   else  #    define DEFAULT_ETC_SSL "/etc/ssl" +#   endif  #  endif  #  ifdef GF_BSD_HOST_OS  #    define DEFAULT_ETC_SSL "/etc/openssl" @@ -866,7 +870,7 @@ __socket_keepalive (int fd, int family, int keepalive_intvl,                  goto err;          }  #else -        if (family != AF_INET) +        if (family != AF_INET && family != AF_INET6)                  goto done;          ret = setsockopt (fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_idle, @@ -3009,6 +3013,21 @@ socket_connect (rpc_transport_t *this, int port)                          }                  } +                /* Make sure we are not vulnerable to someone setting +                 * net.ipv6.bindv6only to 1 so that gluster services are +                 * available over IPv4 & IPv6. 
+                 */ +                int disable_v6only = 0; + +                if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY, +                                (void *)&disable_v6only, +                                sizeof (disable_v6only)) < 0) { +                        gf_log (this->name, GF_LOG_WARNING, +                               "Error disabling sockopt IPV6_V6ONLY: \"%s\"", +                               strerror (errno)); +                } + +                  if (priv->nodelay && (sa_family != AF_UNIX)) {                          ret = __socket_nodelay (priv->sock); diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x index 8462dcc258a..5ec8109d828 100644 --- a/rpc/xdr/src/glusterfs-fops.x +++ b/rpc/xdr/src/glusterfs-fops.x @@ -84,6 +84,7 @@ enum glusterfs_event_t {          GF_EVENT_UPCALL,          GF_EVENT_SCRUB_STATUS,          GF_EVENT_SOME_CHILD_DOWN, +        GF_EVENT_CHILD_PING,          GF_EVENT_MAXVAL  }; diff --git a/run-tests.sh b/run-tests.sh index 1487f30d832..866ab0464b4 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -182,12 +182,14 @@ function get_test_status ()      # for later. Why does the key have the distro and version then?      # Because changing the key in all test files would be very big process      # updating just this function with a better logic much simpler. +    # +    # FB Edit: For FB tests we are disabling NetBSD testing. 
+    #      Linux)          result=$(grep -e "^#G_TESTDEF_TEST_STATUS_CENTOS6" $test_name | \                   awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;;      NetBSD) -        result=$(grep -e "^#G_TESTDEF_TEST_STATUS_NETBSD7" $test_name | \ -                 awk -F"," {'print $1'} | awk -F"=" {'print $2'}) ;; +        result="KNOWN_ISSUE" ;;      *)          result="ENABLED" ;;      esac diff --git a/tests/basic/accept-v6v4.t b/tests/basic/accept-v6v4.t new file mode 100644 index 00000000000..7128c12c6be --- /dev/null +++ b/tests/basic/accept-v6v4.t @@ -0,0 +1,122 @@ +#!/bin/bash + +. $(dirname $0)/../nfs.rc + +# +# This test ensures that GlusterFS provides NFS, Mount and its Management daemon +# over both IPv4 and IPv6. It uses netcat to check the services running on both +# IPv4 & IPv6 addresses as well as a mount to test that mount & nfs work. +# + +IPV4_SUPPORT=false +IPV6_SUPPORT=false + +host $HOSTNAME | grep -q "has address" && IPV4_SUPPORT=true +host $HOSTNAME | grep -q "has IPv6 address" && IPV6_SUPPORT=true + +. 
$(dirname $0)/../include.rc + +cleanup; + +mkdir -p $B0/b{0,1,2} + +# make sure no registered rpcbind services are running +service rpcbind restart + +TEST glusterd +TEST pidof glusterd + +TEST $CLI vol create $V0 replica 3 $H0:$B0/b0 $H0:$B0/b1 $H0:$B0/b2 + +TEST $CLI vol set $V0 cluster.self-heal-daemon off +TEST $CLI vol set $V0 nfs.disable off +TEST $CLI vol set $V0 cluster.choose-local off +TEST $CLI vol start $V0 + +MOUNTD_PORT=38465 +MGMTD_PORT=24007 +NFSD_PORT=2049 + +function check_ip_port { +        ip=$1 +        port=$2 +        type=$3 + +        nc_flags="" +        if [ "$type" == "v6" ] && [ "$ip" == "NONE" ]; then +          echo "Y" +          return +        else +          nc_flags="-6" +        fi + +        if [ "$type" == "v4" ] && [ "$ip" == "NONE" ]; then +          echo "Y" +          return +        fi + +        if exec 3<>/dev/tcp/$ip/$port; then +          echo "Y" +        else +          echo "N" +        fi +} + +function check_nfs { +        ip=$1 +        type=$2 + +        if [ "$ip" == "NONE" ]; then +          echo "Y" +          return +        fi + +        if [ "$type" == "v6" ]; then +          addr="[$ip]" +        else +          addr="$ip" +        fi + +        if mount_nfs $addr:/$V0 $N0; then +          umount_nfs $N0 +          echo "Y" +        else +          echo "N" +        fi +} + +if [ ! $IPV4_SUPPORT ] && [ ! 
$IPV6_SUPPORT ]; then +  exit 1 +fi + +# Get the V4 & V6 addresses of this host +if $IPV4_SUPPORT; then +  V4=$(host $HOSTNAME | head -n1 | awk -F ' ' '{print $4}') +else +  V4="NONE" +fi + +if $IPV6_SUPPORT; then +  V6=$(host $HOSTNAME | tail -n1 | awk -F ' ' '{print $5}') +else +  V6="NONE" +fi + +# First check the management daemon +EXPECT "Y" check_ip_port $V6 $MGMTD_PORT "v6" +EXPECT "Y" check_ip_port $V4 $MGMTD_PORT "v4" + +# Give the MOUNT/NFS Daemon some time to start up +sleep 4 + +EXPECT "Y" check_ip_port $V4 $MOUNTD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $MOUNTD_PORT "v4" + +EXPECT "Y" check_ip_port $V4 $NFSD_PORT "v6" +EXPECT "Y" check_ip_port $V6 $NFSD_PORT "v4" + +# Mount the file system +EXPECT "Y" check_nfs $V6 "v6" +EXPECT "Y" check_nfs $V4 "v4" + +cleanup; diff --git a/tests/basic/dht-min-free-space.t b/tests/basic/dht-min-free-space.t new file mode 100755 index 00000000000..17d10cc39a5 --- /dev/null +++ b/tests/basic/dht-min-free-space.t @@ -0,0 +1,78 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +cleanup; + +grep $B0/patchy1 /proc/mounts &> /dev/null && umount $B0/patchy1 +grep $B0/patchy2 /proc/mounts &> /dev/null && umount $B0/patchy2 +losetup -d /dev/loop0 2> /dev/null +losetup -d /dev/loop1 2> /dev/null +mkdir $B0/${V0}{1..2} + +TEST glusterd + +TEST dd if=/dev/zero of=/tmp/${V0}-dev1 bs=1M count=30 +TEST dd if=/dev/zero of=/tmp/${V0}-dev2 bs=1M count=30 + +TEST losetup /dev/loop0 /tmp/${V0}-dev1 +TEST losetup /dev/loop1 /tmp/${V0}-dev2 + +TEST mkfs.xfs /dev/loop0 +TEST mkfs.xfs /dev/loop1 + +TEST mount /dev/loop0 $B0/${V0}1 +TEST mount /dev/loop1 $B0/${V0}2 + +TEST $CLI volume create $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}2 +TEST $CLI volume set $V0 cluster.min-free-disk 2MB +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 0 +TEST $CLI volume start $V0 + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +#################################### +# Test re-directs of file creation # +#################################### + +# This should work, no redirects +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=8 +TEST [ -f /d/backends/${V0}2/testfile1 ] && [ ! -k /d/backends/${V0}1/testfile1 ] + +TEST $CLI volume set $V0 cluster.min-free-disk 19MB + +# This should work, & the file redirected +# Subvolume 2 should have the linkto & +# Subvolume 1 should have the original +TEST dd if=/dev/zero of=$M0/testfile3 bs=1M count=4 +TEST [ -f /d/backends/${V0}1/testfile3 ] && [ ! -k /d/backends/${V0}1/testfile3 ] +TEST [ -k /d/backends/${V0}2/testfile3 ] + +# This should fail, cluster is full +TEST ! 
dd if=/dev/zero of=$M0/testfile2 bs=1M count=23 + +################### +# Strict mode off # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode off +TEST dd if=/dev/zero of=$M0/testfile1 bs=1M count=20 +TEST rm -f $M0/testfile1 + +################### +# Strict mode on # +################### +TEST $CLI volume set $V0 cluster.min-free-strict-mode on +TEST ! dd if=/dev/zero of=$M0/testfile1 bs=1M count=16 +TEST rm -f $M0/testfile1 + +killall gluster{fs,fsd,d} + +umount -lf $B0/${V0}1 +umount -lf $B0/${V0}2 + +losetup -d /dev/loop0 +losetup -d /dev/loop1 + +cleanup; diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common index 83c4463a912..152e3b51236 100644 --- a/tests/basic/ec/ec-common +++ b/tests/basic/ec/ec-common @@ -45,7 +45,7 @@ for size in $SIZE_LIST; do      eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }')  done -TEST df -h +TEST df -h $M0  TEST stat $M0  for idx in `seq 0 $LAST_BRICK`; do diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 98dd9232c73..3e3467535fb 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -136,7 +136,7 @@ TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024  cs=$(sha1sum $tmp/test | awk '{ print $1 }') -TEST df -h +TEST df -h $M0  TEST stat $M0  for idx in {0..5}; do diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t index fdaf9c2822e..da88bbcb2cc 100644 --- a/tests/basic/exports_parsing.t +++ b/tests/basic/exports_parsing.t @@ -32,7 +32,20 @@ function test_bad_opt ()          glusterfsd --print-exports $1 2>&1 | sed -n 1p  } -EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports +function check_export_line() { +  if [ "$1" == "$2" ]; then +     echo "Y" +  else +    echo "N" +  fi +  return +} + +export_result=$(test_good_file $EXP_FILES/exports) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 
10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result" + +export_result=$(test_good_file $EXP_FILES/exports-v6) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result"  EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports  EXPECT_KEYWORD "Error parsing netgroups for:" test_long_netgroup $EXP_FILES/bad_exports diff --git a/tests/basic/fop-sampling.t b/tests/basic/fop-sampling.t index cea8aa737c0..713c7e27579 100644 --- a/tests/basic/fop-sampling.t +++ b/tests/basic/fop-sampling.t @@ -2,13 +2,27 @@  #  . $(dirname $0)/../include.rc +. $(dirname $0)/../nfs.rc  . $(dirname $0)/../volume.rc -SAMPLE_FILE="$(gluster --print-logdir)/samples/glusterfs_${V0}.samp" +BRICK_SAMPLES="$(gluster --print-logdir)/samples/glusterfsd__d_backends_${V0}0.samp" +NFS_SAMPLES="$(gluster --print-logdir)/samples/glusterfs_nfsd.samp" + +function check_path { +        op=$1 +        path=$2 +        file=$3 +        grep $op $file | awk -F, '{print $11}' | grep $path 2>&1 > /dev/null +        if [ $? 
-eq 0 ]; then +          echo "Y" +        else +          echo "N" +        fi +}  function print_cnt() {    local FOP_TYPE=$1 -  local FOP_CNT=$(grep ,${FOP_TYPE} ${SAMPLE_FILE} | wc -l) +  local FOP_CNT=$(grep ,${FOP_TYPE} ${BRICK_SAMPLES} | wc -l)    echo $FOP_CNT  } @@ -42,12 +56,18 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}  TEST $CLI volume set $V0 nfs.disable off  TEST $CLI volume set $V0 diagnostics.latency-measurement on  TEST $CLI volume set $V0 diagnostics.count-fop-hits on -TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 5  TEST $CLI volume set $V0 diagnostics.fop-sample-buf-size 65535  TEST $CLI volume set $V0 diagnostics.fop-sample-interval 1  TEST $CLI volume set $V0 diagnostics.stats-dnscache-ttl-sec 3600 -  TEST $CLI volume start $V0 + +>${NFS_SAMPLES} +>${BRICK_SAMPLES} + +################# +# Basic Samples # +#################  TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0  for i in {1..5} @@ -58,4 +78,52 @@ done  TEST ls -l $M0  EXPECT_WITHIN 6 "OK" check_samples -cleanup +sleep 2 + +################################ +# Paths in the samples # +################################ + +TEST mount_nfs $H0:$V0 $N0 + +ls $N0 &> /dev/null +touch $N0/file1 +stat $N0/file1 &> /dev/null +echo "some data" > $N0/file1 +dd if=/dev/zero of=$N0/file2 bs=1M count=10 conv=fsync +dd if=/dev/zero of=$N0/file1 bs=1M count=1 +cat $N0/file2 &> /dev/null +mkdir -p $N0/dir1 +rmdir $N0/dir1 +rm $N0/file1 +rm $N0/file2 + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FINODELK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ENTRYLK / $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 
$BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $BRICK_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $BRICK_SAMPLES + + +EXPECT_WITHIN 10 "Y" check_path CREATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path LOOKUP /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path SETATTR /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path WRITE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path FLUSH /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path ACCESS /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path READ /file2 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path TRUNCATE /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path MKDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path RMDIR /dir1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file1 $NFS_SAMPLES +EXPECT_WITHIN 10 "Y" check_path UNLINK /file2 $NFS_SAMPLES + +cleanup; diff --git a/tests/basic/fops-sanity-gfproxy.t b/tests/basic/fops-sanity-gfproxy.t new file mode 100755 index 00000000000..b3bb8a502cc --- /dev/null +++ b/tests/basic/fops-sanity-gfproxy.t @@ -0,0 +1,32 @@ +#!/bin/bash + +. 
$(dirname $0)/../include.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/brick1; +EXPECT 'Created' volinfo_field $V0 'Status'; + +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +#gfproxy server +TEST glusterfs --volfile-id=gfproxy/$V0 --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy.log + +#mount on a random dir +TEST glusterfs --entry-timeout=3600 --attribute-timeout=3600 -s $H0 --volfile-id=gfproxy-client/$V0 $M0 --direct-io-mode=yes +TEST grep gfproxy-client /proc/mounts + +build_tester $(dirname $0)/fops-sanity.c + +TEST cp $(dirname $0)/fops-sanity $M0 +cd $M0 +TEST ./fops-sanity $V0 +cd - +rm -f $(dirname $0)/fops-sanity + +cleanup; diff --git a/tests/basic/gfproxy.t b/tests/basic/gfproxy.t new file mode 100644 index 00000000000..71c6788db76 --- /dev/null +++ b/tests/basic/gfproxy.t @@ -0,0 +1,74 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. $(dirname $0)/../nfs.rc + +cleanup; + +function start_gfproxyd { +        glusterfs --volfile-id=gfproxy/${V0} --volfile-server=$H0  -l /var/log/glusterfs/${V0}-gfproxy.log +} + +function restart_gfproxyd { +        pkill -f gfproxy/${V0} +        start_gfproxyd +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 config.gfproxyd-remote-host $H0 +TEST $CLI volume start $V0 + +sleep 2 + +REGULAR_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-fuse.vol" +GFPROXY_CLIENT_VOLFILE="/var/lib/glusterd/vols/${V0}/trusted-${V0}.tcp-gfproxy-fuse.vol" +GFPROXYD_VOLFILE="/var/lib/glusterd/vols/${V0}/${V0}.gfproxyd.vol" + +# Client volfile must exist +TEST [ -f $GFPROXY_CLIENT_VOLFILE ] + +# AHA & write-behind translators must exist +TEST grep "cluster/aha"  $GFPROXY_CLIENT_VOLFILE +TEST grep "performance/write-behind" $GFPROXY_CLIENT_VOLFILE + +# Make sure we didn't screw up the existing client +TEST grep 
"performance/write-behind" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/replicate" $REGULAR_CLIENT_VOLFILE +TEST grep "cluster/distribute" $REGULAR_CLIENT_VOLFILE + +TEST [ -f $GFPROXYD_VOLFILE ] + +TEST grep "cluster/replicate" $GFPROXYD_VOLFILE +TEST grep "cluster/distribute" $GFPROXYD_VOLFILE + +# AHA & write-behind must *not* exist +TEST ! grep "cluster/aha"  $GFPROXYD_VOLFILE +TEST ! grep "performance/write-behind" $GFPROXYD_VOLFILE + +# Test that we can start the server and the client +TEST start_gfproxyd +TEST glusterfs --volfile-id=gfproxy-client/${V0} --volfile-server=$H0 -l /var/log/glusterfs/${V0}-gfproxy-client.log $M0 +sleep 2 +TEST grep gfproxy-client/${V0} /proc/mounts + +# Write data to the mount and checksum it +TEST dd if=/dev/urandom bs=1M count=10 of=/tmp/testfile1 +md5=$(md5sum /tmp/testfile1 | awk '{print $1}') +TEST cp -v /tmp/testfile1 $M0/testfile1 +TEST [ "$(md5sum $M0/testfile1 | awk '{print $1}')" == "$md5" ] + +rm /tmp/testfile1 + +dd if=/dev/zero of=$N0/bigfile bs=1M count=3072 & +BG_STRESS_PID=$! + +sleep 3 + +restart_gfproxyd + +TEST wait $BG_STRESS_PID + +cleanup; diff --git a/tests/basic/glusterd/volfile_server_switch.t b/tests/basic/glusterd/volfile_server_switch.t index 0b0e6470244..0b01398215c 100644 --- a/tests/basic/glusterd/volfile_server_switch.t +++ b/tests/basic/glusterd/volfile_server_switch.t @@ -1,5 +1,8 @@  #!/bin/bash +#G_TESTDEF_TEST_STATUS_CENTOS6=KNOWN_ISSUE,BUG=000000 +#G_TESTDEF_TEST_STATUS_NETBSD7=KNOWN_ISSUE,BUG=000000 +  . $(dirname $0)/../../include.rc  . $(dirname $0)/../../volume.rc  . $(dirname $0)/../../cluster.rc diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t new file mode 100644 index 00000000000..f3655eaef3b --- /dev/null +++ b/tests/basic/halo-failover-disabled.t @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. 
Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +#    brick immediatelly, and md5s will show they are equal once +#    the write completes. +# 4. The mount should also be RW after the brick is killed as +#    quorum will be immediately restored by swapping in the +#    other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. $(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999 +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.halo-failover-enabled off +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG + +# Use a large ping time here so the spare brick is not marked up +# based on the ping time.  The only way it can get marked up is +# by being swapped in via the down event (which is what we are disabling). 
+TEST $CLI volume set $V0 network.ping-timeout 1000 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +UP_IDX=$(cat /var/log/glusterfs/$M0LOG  | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX} + +# Make sure two children are down and one is up. +EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3 + +# Test that quorum should fail and the mount is RO, the reason here +# is that although there _is_ another brick running which _could_ +# take the failed bricks place, it is not marked "up" so quorum +# will not be fullfilled.  If we waited 1000 second the brick would +# indeed be activated based on ping time, but for our test we want +# the decision to be solely "down event" driven, not ping driven. +TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX + +# Test that quorum should be restored and the file is writable +TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 + +cleanup diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t new file mode 100644 index 00000000000..2dddf9951fa --- /dev/null +++ b/tests/basic/halo-failover-enabled.t @@ -0,0 +1,87 @@ +#!/bin/bash +# +# Tests that fail-over works correctly for Halo Geo-replication +# +# 1. Create a volume @ 3x replication w/ halo + quorum enabled +# 2. Write some data, background it & fail a brick +# 3. The expected result is that the writes fail-over to the 3rd +#    brick immediatelly, and md5s will show they are equal once +#    the write completes. +# 4. 
The mount should also be RW after the brick is killed as +#    quorum will be immediately restored by swapping in the +#    other brick. +# +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc +. $(dirname $0)/../halo.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-failover-enabled on +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.quorum-type fixed +TEST $CLI volume set $V0 cluster.quorum-count 2 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.entry-self-heal on +TEST $CLI volume set $V0 cluster.data-self-heal on +TEST $CLI volume set $V0 cluster.metadata-self-heal on +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 network.ping-timeout 20 +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG +TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG +TEST $CLI volume set $V0 nfs.log-level DEBUG +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + +# Write some data to the mount +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync + +KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") +TEST [ -n "$KILL_IDX" ] +# NB: UP_CHILDREN is the set of children that should be up after we kill +# the brick indicated by KILL_IDX, *not* the set of children which are +# currently up! 
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g")) +UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)" +UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)" +VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)" + +# The victim brick should have a copy of the file. +TEST [ -n "$VICTIM_HAS_TEST" ] + +# Of the bricks which will remain standing, there should be only one +# brick which has the file called test.  If the both have the first +# test file, the test is invalid as all the bricks are up and the +# halo-max-replicas is not being honored; e.g. bug exists. +ONLY_ONE=$((([ -z "$UP2_HAS_TEST" ] || [ -z "$UP1_HAS_TEST" ]) && +  ([ -n "$UP2_HAS_TEST" ] || [ -n "$UP1_HAS_TEST" ])) && echo true) +TEST [ "x$ONLY_ONE" == "xtrue" ] + +echo "Failing child ${KILL_IDX}..." +TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX} + +# Test the mount is still RW (i.e. quorum works) +TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync + +# Calulate the MD5s +MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1) +MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1) + +# Verify the two up bricks have identical MD5s, if both are identical +# then we must have successfully failed-over to the brick which was +# previously proven to be down (via the ONLY_ONE test). +TEST [ "$MD5_UP1" == "$MD5_UP2" ] + +cleanup diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t new file mode 100644 index 00000000000..4574fdfe41e --- /dev/null +++ b/tests/basic/halo-hybrid.t @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Test for the Halo hybrid feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +#    heal daemon is off to start. +# 2. Write some data +# 3. Verify hybrid code chose children for lookups +# 4. Verify hybrid code chose child for reads +# 5. Verify hybrid code wrote synchronously to all replicas +# + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +function found_fuse_log_msg { +  local dir="$1" +  local msg="$2" +  local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l) +  if (( $cnt == 1 )); then +    echo "Y" +  else +    echo "N" +  fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-hybrid-mode True +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 diagnostics.client-log-level TRACE +TEST $CLI volume start $V0 + +# Start a synchronous mount +TEST glusterfs --volfile-id=/$V0 \ +  --xlator-option *replicate*.halo-max-latency=9999  \ +  --volfile-server=$H0 $M0 \ +  --attribute-timeout=0 --entry-timeout=0 +sleep 2 +cd $M0 + +TEST mkdir testdir +TEST cd testdir +for i in {1..5} +do +        dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null +done +TEST ls -l + +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs" +EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child" + +B0_CNT=$(ls $B0/${V0}0/testdir | wc -l) +B1_CNT=$(ls $B0/${V0}1/testdir | wc -l) +B2_CNT=$(ls $B0/${V0}2/testdir | wc -l) + +# Writes should be synchronous, all should have same +# file count +TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))" + +cleanup diff --git a/tests/basic/halo.t b/tests/basic/halo.t new file mode 100644 index 00000000000..25aca3442ab --- /dev/null +++ b/tests/basic/halo.t @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Test for the Halo geo-replication feature +# +# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients, +#    heal daemon is off to start. +# 2. Write some data +# 3. 
Verify at least one of the bricks did not receive the writes. +# 4. Turn the heal daemon on +# 5. Within 30 seconds the SHD should async heal the data over +#    to the 3rd brick. +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 cluster.background-self-heal-count 0 +TEST $CLI volume set $V0 cluster.shd-max-threads 1 +TEST $CLI volume set $V0 cluster.halo-enabled True +TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 +TEST $CLI volume set $V0 cluster.heal-timeout 5 +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.eager-lock off +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 + +for i in {1..5} +do +        dd if=/dev/urandom of=f bs=1M count=1 2>/dev/null +        mkdir a; cd a; +done + +B0_CNT=$(ls $B0/${V0}0 | wc -l) +B1_CNT=$(ls $B0/${V0}1 | wc -l) +B2_CNT=$(ls $B0/${V0}2 | wc -l) + +# One of the brick dirs should be empty +TEST "(($B0_CNT == 0 || $B1_CNT == 0 || $B2_CNT == 0))" + +# Ok, turn the heal daemon on and verify it heals it up +TEST $CLI volume set $V0 cluster.self-heal-daemon on +EXPECT_WITHIN 30 "0" get_pending_heal_count $V0 +cleanup diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t index 9df5cb45c3b..99f032cbd44 100755 --- a/tests/basic/mount-nfs-auth.t +++ b/tests/basic/mount-nfs-auth.t @@ -15,6 +15,9 @@ TEST glusterd  TEST pidof glusterd  TEST $CLI volume info +H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1) +H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}') +  # Export variables for allow & deny  EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)"  EXPORT_ALLOW_SLASH="/$V0/ 
$H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" @@ -37,6 +40,10 @@ function build_dirs () {          mkdir -p $B0/b{0,1,2}/L1/L2/L3  } +function export_allow_this_host_ipv6 () { +        printf "$EXPORT_ALLOW6\n" > ${NFSDIR}/exports +} +  function export_allow_this_host () {          printf "$EXPORT_ALLOW\n" > ${NFSDIR}/exports  } @@ -150,10 +157,7 @@ setup_cluster  TEST $CLI vol set $V0 nfs.disable off  TEST $CLI vol start $V0 -# Get NFS state directory -NFSDIR=$( $CLI volume get patchy nfs.mount-rmtab | \ -          awk '/^nfs.mount-rmtab/{print $2}' | \ -          xargs dirname ) +NFSDIR=/var/lib/glusterd/nfs  ## Wait for volume to register with rpc.mountd  EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available @@ -186,6 +190,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available  ## Mount NFS  EXPECT "Y" check_mount_success $V0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + +## Mount NFS using the IPv6 export +export_allow_this_host_ipv6 +EXPECT "Y" check_mount_success $V0  ## Disallow host  TEST export_deny_this_host diff --git a/tests/basic/write-behind.t b/tests/basic/write-behind.t new file mode 100644 index 00000000000..edad59786af --- /dev/null +++ b/tests/basic/write-behind.t @@ -0,0 +1,53 @@ +#!/bin/bash +# + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +function clear_stats { +        > /var/lib/glusterfs/stats/glusterfs_d_backends_${V0}0.dump +} + +function got_expected_write_count { +        expected_size=$1 +        expected_value=$2 +        grep aggr.write_${expected_size} "/var/lib/glusterd/stats/glusterfsd__d_backends_${V0}0.dump" | grep $expected_value +        if [ $? 
== 0 ]; then +          echo "Y"; +        else +          echo "N"; +        fi +} + +cleanup; + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} + +# These are needed for our tracking of write sizes +TEST $CLI volume set $V0 diagnostics.latency-measurement on +TEST $CLI volume set $V0 diagnostics.count-fop-hits on +TEST $CLI volume set $V0 diagnostics.stats-dump-interval 2 + +# Disable this in testing to get deterministic results +TEST $CLI volume set $V0 performance.write-behind-trickling-writes off + +TEST $CLI volume start $V0 + +sleep 2; + +TEST glusterfs -s $H0 --volfile-id $V0 $M0 + +# Write a 100MB file with a window-size 1MB, we should get 100 writes of 1MB each +TEST dd if=/dev/zero of=$M0/100mb_file bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "1mb" 100 + +TEST $CLI volume set $V0 performance.write-behind-window-size 512KB + +# Write a 100MB file with a window-size 512KB, we should get 200 writes of 512KB each +TEST dd if=/dev/zero of=$M0/100mb_file_2 bs=1M count=100 +EXPECT_WITHIN 5 "Y" got_expected_write_count "512kb" 200 + +cleanup; diff --git a/tests/bugs/distribute/bug-1099890.t b/tests/bugs/distribute/bug-1099890.t index 40f70d4938b..29ceccf2309 100644 --- a/tests/bugs/distribute/bug-1099890.t +++ b/tests/bugs/distribute/bug-1099890.t @@ -44,6 +44,8 @@ TEST   $CLI volume set $V0 features.quota-deem-statfs on  TEST   $CLI volume quota $V0 limit-usage / 150MB; +TEST   $CLI volume set $V0 cluster.du-refresh-interval-sec 1 +  TEST   $CLI volume set $V0 cluster.min-free-disk 50%  TEST   glusterfs -s $H0 --volfile-id=$V0 $M0 diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t index c5a7f041ac8..8cf905a8f0b 100755 --- a/tests/bugs/distribute/bug-1161311.t +++ b/tests/bugs/distribute/bug-1161311.t @@ -53,8 +53,14 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0;  TEST mkdir $M0/dir1  TEST mkdir -p $M0/dir2/dir3 -# Create a large file (1GB), so that rebalance 
takes time -dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240 +# Create a large file (6.4 GB), so that rebalance takes time +# Reading from /dev/urandom is slow, so we'll cat it together +dd if=/dev/urandom of=/tmp/FILE2 bs=64k count=10240 +for i in {1..10}; do +  cat /tmp/FILE2 >> $M0/dir1/FILE2 +done + +#dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240  # Rename the file to create a linkto, for rebalance to  # act on the file diff --git a/tests/bugs/fuse/bug-858488-min-free-disk.t b/tests/bugs/fuse/bug-858488-min-free-disk.t index 635dc04d1e6..ab636575d3f 100644 --- a/tests/bugs/fuse/bug-858488-min-free-disk.t +++ b/tests/bugs/fuse/bug-858488-min-free-disk.t @@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2  ## Lets create volume  TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}; +TEST $CLI volume set $V0 cluster.du-refresh-interval-sec 1  ## Verify volume is created  EXPECT "$V0" volinfo_field $V0 'Volume Name'; diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t index 9fc7ac3b845..3bc80ab9dab 100644 --- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t +++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t @@ -1,6 +1,6 @@  #!/bin/bash -## Test case for cluster.min-free-disk option validation. +## Test case for cluster.cluster.min-free-disk option validation.  . $(dirname $0)/../../include.rc @@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2  TEST $CLI volume start $V0  ## Setting invalid value for option cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk 143.!/12 -TEST ! $CLI volume set $V0 min-free-disk 123% -TEST ! $CLI volume set $V0 min-free-disk 194.34% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12 +TEST ! 
$CLI volume set $V0 cluster.min-free-disk 123% +TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34%  ## Setting fractional value as a size (unit is byte) for option  ## cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk 199.051 -TEST ! $CLI volume set $V0 min-free-disk 111.999 +TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051 +TEST ! $CLI volume set $V0 cluster.min-free-disk 111.999  ## Setting valid value for option cluster.min-free-disk should pass -TEST  $CLI volume set $V0 min-free-disk 12% -TEST  $CLI volume set $V0 min-free-disk 56.7% -TEST  $CLI volume set $V0 min-free-disk 120 -TEST  $CLI volume set $V0 min-free-disk 369.0000 +TEST  $CLI volume set $V0 cluster.min-free-disk 12% +TEST  $CLI volume set $V0 cluster.min-free-disk 56.7% +TEST  $CLI volume set $V0 cluster.min-free-disk 120 +TEST  $CLI volume set $V0 cluster.min-free-disk 369.0000  cleanup; diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t index c30d2b852d4..1b9ca18c08a 100755 --- a/tests/bugs/glusterd/bug-859927.t +++ b/tests/bugs/glusterd/bug-859927.t @@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes "     "  TEST   $CLI volume set $V0 min-free-inodes 60%  EXPECT "60%" volume_option $V0 cluster.min-free-inodes -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk "     " -TEST   $CLI volume set $V0 min-free-disk 60% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk "     " +TEST   $CLI volume set $V0 cluster.min-free-disk 60%  EXPECT "60%" volume_option $V0 cluster.min-free-disk -TEST   $CLI volume set $V0 min-free-disk 120 +TEST   $CLI volume set $V0 cluster.min-free-disk 120  EXPECT "120" volume_option $V0 cluster.min-free-disk  TEST ! 
$CLI volume set $V0 frame-timeout "" diff --git a/tests/bugs/quota/bug-1292020.t b/tests/bugs/quota/bug-1292020.t index 14b311c9d76..f713c74859b 100644 --- a/tests/bugs/quota/bug-1292020.t +++ b/tests/bugs/quota/bug-1292020.t @@ -4,10 +4,12 @@  . $(dirname $0)/../../volume.rc  function write_sample_data () { -        dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 | grep -i exceeded +        dd if=/dev/zero of=$M0/f1 bs=256k count=400 2>&1 | +            egrep -i 'exceeded|no space' && echo 'passed'  }  cleanup; +rm -f /tmp/kbv.log  TEST glusterd;  TEST pidof glusterd; @@ -18,7 +20,8 @@ TEST $CLI volume quota $V0 enable;  TEST $CLI volume quota $V0 limit-usage / 1  TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0; -EXPECT "exceeded" write_sample_data + +EXPECT "passed" write_sample_data  TEST $CLI volume stop $V0  TEST $CLI volume delete $V0 diff --git a/tests/cluster.rc b/tests/cluster.rc index 467bbcb06e1..42547f09e37 100644 --- a/tests/cluster.rc +++ b/tests/cluster.rc @@ -46,17 +46,18 @@ function define_glusterds() {          bopt="management.transport.socket.bind-address=${!h}";          popt="--pid-file=${!b}/glusterd.pid";          sopt="management.glusterd-sockfile=${!b}/glusterd/gd.sock" +        aopt="*.transport.address-family=inet"          #Get the logdir          logdir=`gluster --print-logdir`          #Fetch the testcases name and prefix the glusterd log with it          logfile=`echo ${0##*/}`_glusterd$i.log          lopt="--log-file=$logdir/$logfile"          if [ "$2" == "-LDEBUG" ]; then -            eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; -            eval "glusterd$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; +            eval "glusterd_$i='glusterd -LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; +            eval "glusterd$i='glusterd 
-LDEBUG --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";          else -            eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; -            eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt $lopt $popt'"; +            eval "glusterd_$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'"; +            eval "glusterd$i='glusterd --xlator-option $wopt --xlator-option $bopt --xlator-option $sopt --xlator-option $aopt $lopt $popt'";          fi      done  } diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6 new file mode 100644 index 00000000000..426b1ef5705 --- /dev/null +++ b/tests/configfiles/exports-v6 @@ -0,0 +1 @@ +/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) diff --git a/tests/env.rc.in b/tests/env.rc.in index 82971c4a8de..87befc3711d 100644 --- a/tests/env.rc.in +++ b/tests/env.rc.in @@ -28,3 +28,6 @@ export PYTHON  PYTHONPATH=@BUILD_PYTHON_SITE_PACKAGES@:$PYTHON_PATH  export PYTHONPATH + +TESTER_CFLAGS="@TESTER_CFLAGS@" +export TESTER_CFLAGS diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t new file mode 100755 index 00000000000..4372998681f --- /dev/null +++ b/tests/features/brick-min-free-space.t @@ -0,0 +1,113 @@ +#!/bin/bash +# +# Test storage.min-free-disk option works. +# + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd + +TEST truncate -s 16M $B0/brick0 +TEST LOOPDEV=$(losetup --find --show $B0/brick0) +TEST mkfs.xfs $LOOPDEV + +mkdir -p $B0/$V0 + +TEST mount -t xfs $LOOPDEV $B0/$V0 + +########### +# AIO on  # +########### + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio on + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +############ +# AIO off  # +############ + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! 
dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +cleanup; diff --git a/tests/features/lock_revocation.t b/tests/features/lock_revocation.t new file mode 100644 index 00000000000..cbf21b71650 --- /dev/null +++ b/tests/features/lock_revocation.t @@ -0,0 +1,52 @@ +#!/bin/bash +logdir=$(gluster --print-logdir) +BRICK_LOGFILES="$logdir/bricks/d-backends-brick?.log" +rm -f $BRICK_LOGFILES &> /dev/null + +# Test that lock revocation works + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc +cleanup; + +function deadlock_fop() { +  local MNT=$1 +  for i in {1..1000}; do +    dd if=/dev/zero of=$MNT/testfile bs=1k count=10 &> /dev/null +    if grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null; then +      break +    fi +  done +} + +function monkey_unlock() { +  grep "MONKEY LOCKING" $BRICK_LOGFILES &> /dev/null && echo SUCCESS +  return 0 +} + +function append_to_file() { +  local FILE_PATH=$1 +  echo "hello" >> $FILE_PATH +  return 0 +} + +#Init +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} +TEST $CLI volume set $V0 self-heal-daemon off +TEST $CLI volume set $V0 features.locks-monkey-unlocking on +TEST $CLI volume set $V0 features.locks-revocation-secs 2 +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=$V0 -s $H0 $M0; +TEST $GFS --volfile-id=$V0 -s $H0 $M1; + +# Deadlock writes to a file using monkey unlocking +deadlock_fop $M0 & +EXPECT_WITHIN 60 "SUCCESS" monkey_unlock + +# Sleep > unlock timeout and attempt to write to the file +sleep 3 +TEST append_to_file $M1/testfile + +cleanup diff --git a/tests/halo.rc b/tests/halo.rc new file mode 100644 index 00000000000..4cb7c81da85 --- /dev/null +++ b/tests/halo.rc @@ -0,0 +1,52 @@ +# Return the current Halo state of a given child (by index, i.e. 0 +# is first child). +function halo_child_state { +    grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG | +         tail -n1 | sed 's/^.* halo state: //' | sed 's/ .*$//' +} + +# Return number of Halo children which are in a given state. +# First parameter is total # children. +# Second parameter is state to match (e.g. "UP"). 
+function halo_children_in_state { +    local CHILD_COUNT=$1 +    local SUM=0 +    for CHILD in $(seq 0 $((CHILD_COUNT-1))); do +        if [ x"$(halo_child_state $CHILD)" == x"$2" ]; then +            SUM=$((SUM+1)) +        fi +    done +    echo $SUM +} + +# Return number of up halo children, +# First parameter is total # children, +function halo_children_up { +    echo $(halo_children_in_state $1 "UP") +} + +# Return number of down halo children, +# First parameter is total # children, +function halo_children_down { +    echo $(halo_children_in_state $1 "DOWN") +} + +# Return number of up & down halo children. +# First parameter is total number of children. +function halo_sum_child_states { +    local CHILD_COUNT=$1 + +    local UP=0 +    local DOWN=0 + +    for CHILD in $(seq 0 $((CHILD_COUNT-1))); do +        local STATE=$(halo_child_state $CHILD) +        if [ x"$STATE" == x"UP" ]; then +            UP=$((UP+1)) +        elif [ x"$STATE" == x"DOWN" ]; then +            DOWN=$((DOWN+1)) +        fi +    done + +    echo "$UP $DOWN" +} diff --git a/tests/include.rc b/tests/include.rc index 492e35a7b6c..9f32e88f5f5 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -19,6 +19,8 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g  CC=cc  OSTYPE=$(uname -s) +M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point +  ENV_RC=$(dirname $0)/../env.rc  if [ ! 
-f $ENV_RC ]; then     ENV_RC=$(dirname $0)/../../env.rc @@ -612,6 +614,7 @@ function build_tester ()      then         cflags="$cflags $(pkg-config glusterfs-api --cflags-only-I --libs-only-L)"      fi +    cflags="$cflags ${TESTER_CFLAGS}"      $CC -g -o $(dirname $cfile)/$execname $cfile $cflags  } diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 903fbb39f12..bce94bb8b3b 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht ec +SUBDIRS = aha stripe afr dht ec  CLEANFILES =  diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 4d9327b8a11..fb3318da36a 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -43,6 +43,13 @@  #include "afr-self-heald.h"  #include "afr-messages.h" +#define CHILD_UP_STR "UP" +#define CHILD_DOWN_STR "DOWN" +#define CHILD_DISCONNECTED_STR "DOWN" + +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *fastest_children); +  call_frame_t *  afr_copy_frame (call_frame_t *base)  { @@ -1454,21 +1461,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)                               sizeof(gfid_copy)) % child_count;  } +/* + * afr_halo_read_subvol + * + * Given a array representing the readable children, this function will + * return which one of the readable children meet the halo hybrid criteria. + * In the event none are found, -1 is returned and another strategy will have + * to be used to figure out where the read should come from. + */ +int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) { +	afr_private_t *priv = NULL; +        unsigned char *hybrid_children; +        int32_t hybrid_cnt = 0; +	int read_subvol = -1; +	int i = 0; + +	priv = this->private; + +        /* Halo in-active or hybrid mode disabled, bail.... 
*/ +        if (!priv->halo_enabled || !priv->halo_hybrid_mode) +                return -1; + +        /* AFR Discovery edge case, if you are already pinned to a child +         * which meets the latency threshold then go with this child for +         * consistency purposes. +         */ +        if (priv->read_child >= 0 && readable[priv->read_child] && +            priv->child_latency[priv->read_child] <= +             AFR_HALO_HYBRID_LATENCY_MSEC) { +                return priv->read_child; +        } + +        hybrid_children = alloca0 (priv->child_count); +        hybrid_cnt = find_hybrid_children (this, hybrid_children); +        if (hybrid_cnt) { +                for (i = 0; i < priv->child_count; i++) { +                        if (readable[i] && hybrid_children[i]) { +                                read_subvol = i; +                                priv->read_child = read_subvol; +                                gf_log (this->name, GF_LOG_TRACE, +                                        "Selected hybrid child %d for reads", +                                        i); +                                break; +                        } +                } +        } + +        return read_subvol; +} +  int  afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,  				  unsigned char *readable,                                    afr_read_subvol_args_t *args)  { -	int             i           = 0; -	int             read_subvol = -1; -	afr_private_t  *priv        = NULL; +	    int             i           = 0; +	    int             read_subvol = -1; +	    afr_private_t  *priv        = NULL;          afr_read_subvol_args_t local_args = {0,}; -	priv = this->private; +	    priv = this->private; + +        /* Choose lowest latency child for reads */ +        read_subvol = afr_halo_read_subvol (this, readable); +        if (read_subvol != -1) +                return read_subvol; -	/* first preference - explicitly specified or local subvolume */ -	if 
(priv->read_child >= 0 && readable[priv->read_child]) +        /* first preference - explicitly specified or local subvolume */ +	    if (priv->read_child >= 0 && readable[priv->read_child])                  return priv->read_child;          if (inode_is_linked (inode)) { @@ -1494,7 +1555,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,          return -1;  } -  int  afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,  				unsigned char *readable, int *event_p, @@ -2154,6 +2214,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          priv->children[child_index]->name);                  priv->read_child = child_index; +        } else if (priv->halo_enabled) { +                if (priv->read_child < 0) { +                        priv->read_child = child_index; +                } else if (priv->child_latency[child_index] < +                    priv->child_latency[priv->read_child]) { +                        priv->read_child = child_index; +                }          }  out:          STACK_DESTROY(frame->root); @@ -2345,7 +2412,6 @@ unwind:          return 0;  } -  int  afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)  { @@ -2571,6 +2637,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)  	afr_local_t *local = NULL;  	afr_private_t *priv = NULL;  	int call_count = 0; +        unsigned char *hybrid_children = NULL;  	local = frame->local;  	priv = this->private; @@ -2581,8 +2648,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)  		goto out;  	} -	call_count = local->call_count = AFR_COUNT (local->child_up, -						    priv->child_count); +        hybrid_children = alloca0 (priv->child_count); +        call_count = find_hybrid_children (this, hybrid_children); +        if (call_count) { +                for (i = 0; i < priv->child_count; i++) +                        local->child_up[i] = hybrid_children[i]; +                gf_log 
(this->name, GF_LOG_TRACE, "Selected %d hybrid " +                        "children for LOOKUPs", call_count); +        } else { +                hybrid_children = NULL; +                call_count = AFR_COUNT (local->child_up, priv->child_count); +        } + +        local->call_count = call_count;          ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,  					    &local->loc); @@ -2815,6 +2893,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)  	afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,  			     AFR_DATA_TRANSACTION, NULL); +        /* So this is the "secret" to why "Hybrid" halo works.  Encoded in +         * the cached inodes, we store what is effectively the "generational" +         * state of the cluster along with a "packed" version of the extended +         * attributes which determine which nodes are wise/fools.  We can +         * consult these cached values to figure out who we can trust, in the +         * event the state of our cluster changes and we can no longer trust +         * the cached info we "refresh" the inode (and hit all regions) to +         * ensure we know which bricks we can safely read from. +         */  	if (event != local->event_generation)  		afr_inode_refresh (frame, this, loc->parent, NULL,                                     afr_lookup_do); @@ -3039,7 +3126,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          }          UNLOCK (&frame->lock); -	call_count = afr_frame_return (frame); +        call_count = afr_frame_return (frame);  	if (call_count == 0)  		AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4287,25 +4374,569 @@ __get_heard_from_all_status (xlator_t *this)          return heard_from_all;  } +/* + * afr_cmp_child + * + * Passed to the qsort function to order a list of children by the latency + * and/or up/down states. + * + * Note: This isn't as simple as taking the latencies and calling it a + * a day.  
Children can be marked down, which overrides their latency + * signal.  Having a lower-latency child available doesn't guarantee this + * child shall be marked up: we don't want to constantly be swapping + * slightly better bricks for others...this is jarring to clients and + * could cause all sorts of issues.  Plus, the fail-over, max-replicas + * flags must all be honored which manage the up/down state of children. + * + * In short, the (as marked) up/down state of the brick shall always + * take precedence when sorting by latency. + */ +static int +_afr_cmp_child (const void *child1, const void *child2) +{ +        struct afr_child *child11 = (struct afr_child *)child1; +        struct afr_child *child22 = (struct afr_child *)child2; + +        /* If both children are _marked_ down they are equal */ +        if (!child11->child_up && !child22->child_up) +                return 0; + +        /* Prefer child 2, child 1 is _marked_ down, child 2 is not */ +        if (!child11->child_up && child22->child_up) +                return 1; + +        /* Prefer child 1, child 2 is _marked_ down, child 1 is not */ +        if (child11->child_up && !child22->child_up) +                return -1; + +        if (child11->latency > child22->latency) { +                return 1; +        } +        if (child11->latency == child22->latency) { +                return 0; +        } +        return -1; +} + +/* + * find_hybrid_children + * + * Given a char array representing our children (aka bricks within our AFR + * "subvolume"), we'll mark this array with the children which are + * within the halo_hybrid_read_max_latency_msec or if none fit this condition, + * we'll pick the fastest two bricks. + * + * You might ask, why not just pick the quickest brick and be done with it? 
+ * Well, being within our set is not sufficient to be chosen for the read, + * we must also be marked "readable", we still want to choose as many as + * we can within our local region to ensure we have somebody that is readable. + * + * To illustrate this, consider the case where 1 of 2 bricks received a sync + * from some other writer, and the 2nd brick although faster wasn't present. + * In this case we'll want to use the slower brick to service the read. + * + * In short, this function just tells the caller which children are hybrid; + * it gives no signal as to their readability, nor should it since this is + * handled later in the various flows (e.g. by afr_halo_read_subvol). + */ +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *hybrid_children) +{ +        int32_t i = 0; +        afr_private_t *priv = NULL; +        struct afr_child   *sorted_list = NULL; +        uint32_t max_latency; +        uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT; + +        priv = this->private; + +        if (!priv->halo_enabled || !priv->halo_hybrid_mode) +                return 0; + +        if (limit > priv->child_count) +                limit = priv->child_count; + +        max_latency = priv->halo_hybrid_read_max_latency_msec; + +        sorted_list = alloca (sizeof (struct afr_child) * priv->child_count); + +        /* Snapshot each child's index, up state and latency */ +        for (i = 0; i < priv->child_count; i++) { +                sorted_list[i].idx = i; +                sorted_list[i].child_up = priv->child_up[i]; +                sorted_list[i].latency = priv->child_latency[i]; +        } + +        /* Sort the children: up before down, then by latency */ +        qsort (sorted_list, priv->child_count, sizeof (struct afr_child), +               _afr_cmp_child); + +        i = 0; +        while (i < priv->child_count && sorted_list[i].latency <= max_latency) +                hybrid_children[sorted_list[i++].idx] = 1; + +        /* Found some candidates */ +        
if (i != 0) +                return i; + +        /* If no candidates can be found meeting the max_latency threshold +         * then find the best of those we have to our limit. +         */ +        for (i = 0; i < limit; i++) +                hybrid_children[sorted_list[i].idx] = 1; + +        return i; +} + +int +find_best_down_child (xlator_t *this) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int32_t         best_child          = -1; +        int64_t         best_latency        = INT64_MAX; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (!priv->child_up[i] && +                    priv->child_latency[i] >= 0 && +                    priv->child_latency[i] < best_latency) { +                        best_child = i; +                        best_latency = priv->child_latency[i]; +                } +        } +        if (best_child >= 0) { +                gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) " +                        "@ %ld ms latency", best_child, best_latency); +        } +        return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int32_t         worst_child         = -1; +        int64_t         worst_latency       = INT64_MIN; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (priv->child_up[i] && +                    priv->child_latency[i] >= 0 && +                    priv->child_latency[i] >= worst_latency) { +                        worst_child = i; +                        worst_latency = priv->child_latency[i]; +                } +        } +        if (worst_child >= 0) { +                gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)" +                        " @ %ld ms latency", worst_child, 
worst_latency); +        } +        return worst_child; +} + +static const char *halo_state_str(int i) +{ +    switch (i) { +    case 0: return "DOWN"; +    case 1: return "UP"; +    } + +    return "unknown"; +} + + +static void dump_halo_states (xlator_t *this) { +        afr_private_t   *priv               = NULL; +        int             i                   = -1; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Child %d halo state: %s (N/A)", +                                i, +                                halo_state_str(priv->child_up[i])); +                 } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Child %d halo state: %s (%"PRIi64" ms)", +                                i, +                                halo_state_str(priv->child_up[i]), +                                priv->child_latency[i]); +                } +        } +} + +static void +_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, +                const int idx, const int64_t halo_max_latency_msec, +                int32_t *event, int64_t *child_latency_msec, +                gf_boolean_t child_halo_enabled) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int             up_children         = 0; +        int             best_down_child     = 0; +        uint64_t        latency_samples     = 0; + +        priv = this->private; + +        /* Base it off the _minimum_ latency we've ever seen */ +        *child_latency_msec = child_xlator->client_latency.min / 1000.0; +        latency_samples = child_xlator->client_latency.count; +        priv->child_latency[idx] = *child_latency_msec; + +        for (i = 0; i < priv->child_count; i++) { +                if 
(priv->child_up[i] == 1) { +                        up_children++; +                } +        } + +        /* Don't do anything until you have some minimum number of +         * latency samples */ +        if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) { +                gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient " +                        " number of latency samples (%" PRIu64 +                        " < %d), halo in-active.", +                        latency_samples, priv->halo_min_samples); +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "ping: child %u (%s) latency %"PRIu64" ms (max %"PRIu64" ms)" +                " up_count %d (min %d) enabled %s", +                idx, child_xlator ? child_xlator->name : "<null>", +                *child_latency_msec, +                halo_max_latency_msec, +                up_children, +                priv->halo_min_replicas, +                child_halo_enabled ? "true" : "false"); + +        /* +         * Case 1: This child's latency exceeds the maximum allowable +         * for this halo. 
+         */ +        if (child_halo_enabled && +            *child_latency_msec > halo_max_latency_msec && +            priv->child_up[idx] == 1 && +            up_children > priv->halo_min_replicas) { +                if (find_worst_up_child (this) == idx) { +                        gf_log (child_xlator->name, GF_LOG_INFO, +                                "Child latency (%"PRIi64"ms) " +                                 "exceeds halo threshold (%"PRIi64"), " +                                 "marking child down, " +                                 "min_replicas (%d) still " +                                 "satisfied.", +                                 *child_latency_msec, +                                 halo_max_latency_msec, +                                 priv->halo_min_replicas); +                        *event = GF_EVENT_CHILD_DOWN; +                } +        /* +         * Case 2: Child latency is within halo and currently marked down, +         * mark it up. +         */ +        } else if ((child_halo_enabled == _gf_false || +                    *child_latency_msec <= halo_max_latency_msec) && +                   priv->child_up[idx] == 0) { +                if (child_halo_enabled == _gf_false || +                        up_children < priv->halo_max_replicas) { +                        gf_log (child_xlator->name, GF_LOG_INFO, +                                "Child latency (%ld ms) " +                                "below halo threshold (%ld) or halo is " +                                "disabled, marking child up.", +                                *child_latency_msec, +                                halo_max_latency_msec); +                        *event = GF_EVENT_CHILD_UP; +                } else { +                        gf_log (child_xlator->name, GF_LOG_INFO, +                            "Not marking child %d up, " +                            "max replicas (%d) reached.", idx, +                            priv->halo_max_replicas); +       
         } +        /* +         * Case 3: Child latency is within halo,and currently marked up, +         * mark it down if it's the highest latency child and the +         * number of up children is greater than halo_max_replicas. +         * UNLESS you are an SHD in which case do nothing. +         */ +        } else if ((child_halo_enabled == _gf_true && +                        *child_latency_msec <= halo_max_latency_msec) && +                        priv->child_up[idx] == 1) { +                if (find_worst_up_child (this) == idx && +                                up_children > priv->halo_max_replicas && +                                !priv->shd.iamshd) { +                        gf_log (child_xlator->name, GF_LOG_INFO, +                                "Child latency (%"PRIi64"ms) " +                                "exceeds halo threshold (%"PRIi64"), " +                                "but halo_max_replicas (%d) exceeded, " +                                "marking child down.", +                                *child_latency_msec, +                                halo_max_latency_msec, +                                priv->halo_max_replicas); +                        *event = GF_EVENT_CHILD_DOWN; +                } +        } + +        if (*event != GF_EVENT_CHILD_PING && +            gf_log_get_loglevel () >= GF_LOG_DEBUG) { +                gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:"); +                dump_halo_states (this); +        } +} + +void +_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, +                const int idx, int64_t halo_max_latency_msec, +                int32_t *event, int32_t *call_psh, int32_t *up_child, +                gf_boolean_t child_halo_enabled) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int             up_children         = 0; +        int             worst_up_child      = -1; +        gf_boolean_t    was_down    
        = _gf_false; + +        priv = this->private; + +       /* +         * This only really counts if the child was never up +         * (value = -1) or had been down (value = 0).  See +         * comment in _afr_handle_child_down_event for a more detailed +         * explanation. +         */ +        if (priv->child_up[idx] != 1) { +                /* +                 * Track the fact we did this, we may need to repeal this +                 * if we later decide to mark this brick down. +                 */ +                was_down = _gf_true; +                priv->event_generation++; +        } +        priv->child_up[idx] = 1; + +        *call_psh = 1; +        *up_child = idx; +        for (i = 0; i < priv->child_count; i++) +                if (priv->child_up[i] == 1) +                        up_children++; + +        /* +         * Handle the edge case where we exceed +         * halo_min_replicas and we've got a child which is +         * marked up as it was helping to satisfy the +         * halo_min_replicas even though its latency exceeds +         * halo_max_latency_msec. 
+         */ +        if (child_halo_enabled == _gf_true && +            up_children > priv->halo_min_replicas) { +                worst_up_child = find_worst_up_child (this); +                if (worst_up_child >= 0 && +                    priv->child_latency[worst_up_child] > +                    halo_max_latency_msec) { +                        if (was_down == _gf_true) +                                priv->event_generation--; +                        *call_psh = 0; +                        priv->child_up[worst_up_child] = 0; +                        up_children--; +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Marking child %d down, " +                                "doesn't meet halo threshold " +                                "(%ld), and > " +                                "halo_min_replicas (%d)", +                                worst_up_child, +                                halo_max_latency_msec, +                                priv->halo_min_replicas); +                        goto out; +                } +        } +        if (priv->halo_enabled && +            up_children > priv->halo_max_replicas && +            !priv->shd.iamshd) { +                if (was_down == _gf_true) +                        priv->event_generation--; +                *call_psh = 0; +                worst_up_child = find_worst_up_child (this); +                if (worst_up_child < 0) { +                        worst_up_child = idx; +                } +                priv->child_up[worst_up_child] = 0; +                gf_log (this->name, GF_LOG_INFO, +                        "Marking child %d down, " +                        "up_children (%d) > " +                        "halo_max_replicas (%d)", +                        worst_up_child, +                        up_children, +                        priv->halo_max_replicas); +                up_children--; +                goto out; +        } +out: +        if (up_children 
== 1) { +                gf_log (this->name, GF_LOG_INFO, +                        "Subvolume '%s' came back up; " +                        "going online.", +                        child_xlator->name); +        } else { +                *event = GF_EVENT_CHILD_MODIFIED; +        } + +        priv->last_event[idx] = *event; + +        if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { +                gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); +                dump_halo_states (this); +        } +} + +void +_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, +                int idx, int64_t child_latency_msec, +                int64_t halo_max_latency_msec, int32_t *event, +                int32_t *call_psh, int32_t *up_child, +                gf_boolean_t child_halo_enabled) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int             up_children         = 0; +        int             down_children       = 0; +        int             best_down_child     = -1; +        gf_boolean_t    swap_child          = _gf_false; + +        priv = this->private; + +        /* +         * If a brick is down when we start, we'll get a +         * CHILD_DOWN to indicate its initial state.  There +         * was never a CHILD_UP in this case, so if we +         * increment "down_count" the difference between than +         * and "up_count" will no longer be the number of +         * children that are currently up.  This has serious +         * implications e.g. for quorum enforcement, so we +         * don't increment these values unless the event +         * represents an actual state transition between "up" +         * (value = 1) and anything else. 
+         */ +        if (priv->child_up[idx] == 1) { +                priv->event_generation++; +        } + +        /* +         * If this is an _actual_ CHILD_DOWN event, we +         * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to +         * indicate the child is really disconnected. +         */ +        if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) { +                priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY; +        } +        priv->child_up[idx] = 0; + +        for (i = 0; i < priv->child_count; i++) +                if (priv->child_up[i] == 1) +                        up_children++; + +        /* +         * Handle the edge case where we need to find the +         * next best child (to mark up) as marking this child +         * down would cause us to fall below halo_min_replicas. +         * We will also force the SHD to heal this child _now_ +         * as we want it to be up to date if we are going to +         * begin using it synchronously. +         */ +        best_down_child = find_best_down_child (this); +        if (child_halo_enabled == _gf_true) { +                if (up_children < priv->halo_min_replicas && +                                priv->halo_failover_enabled == _gf_true) +                        swap_child = _gf_true; +                else if (up_children < priv->halo_max_replicas && +                                priv->child_latency[best_down_child] <= +                                halo_max_latency_msec && +                                priv->halo_failover_enabled == _gf_true) +                        swap_child = _gf_true; +        } + +        if (swap_child) { +                if (best_down_child >= 0) { +                        gf_log (this->name, GF_LOG_INFO, +                                "Swapping out child %d for " +                                "child %d to satisfy " +                                "halo_min_replicas (%d).", +                                idx, best_down_child, +       
                         priv->halo_min_replicas); +                        priv->child_up[best_down_child] = 1; +                        *call_psh = 1; +                        *up_child = best_down_child; +                } +        } + +        for (i = 0; i < priv->child_count; i++) +                if (priv->child_up[i] == 0) +                        down_children++; +        if (down_children == priv->child_count) { +                gf_log (this->name, GF_LOG_ERROR, +                        "All subvolumes are down. Going " +                        "offline until atleast one of them " +                        "comes back up."); +        } else { +                *event = GF_EVENT_CHILD_MODIFIED; +        } +        priv->last_event[idx] = *event; + +        if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { +                gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); +                dump_halo_states (this); +        } +} + +int64_t +_afr_get_halo_latency (xlator_t *this) +{ +        afr_private_t *priv           = NULL; +        int64_t halo_max_latency_msec = 0; + +        priv = this->private; + +        if (priv->shd.iamshd) { +                halo_max_latency_msec = priv->shd.halo_max_latency_msec; +        } else if (priv->nfsd.iamnfsd) { +                halo_max_latency_msec = +                        priv->nfsd.halo_max_latency_msec; +        } else { +                halo_max_latency_msec = priv->halo_max_latency_msec; +        } +        gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %ld", +                halo_max_latency_msec); +        return halo_max_latency_msec; +} + +  int32_t  afr_notify (xlator_t *this, int32_t event,              void *data, void *data2)  { +        xlator_t        *child_xlator       = NULL;          afr_private_t   *priv               = NULL;          int             i                   = -1; -        int             up_children         = 0; -        int             down_children       = 0;          int     
        propagate           = 0;          int             had_heard_from_all  = 0;          int             have_heard_from_all = 0;          int             idx                 = -1;          int             ret                 = -1;          int             call_psh            = 0; +        int             up_child            = -1; +        uint64_t        latency_samples     = 0;          dict_t          *input              = NULL;          dict_t          *output             = NULL;          gf_boolean_t    had_quorum          = _gf_false;          gf_boolean_t    has_quorum          = _gf_false; +        int64_t         halo_max_latency_msec = 0; +        int64_t         child_latency_msec   = AFR_CHILD_DOWN_LATENCY; +        gf_boolean_t    child_halo_enabled   = _gf_false; +        child_xlator = (xlator_t *)data;          priv = this->private;          if (!priv) @@ -4318,7 +4949,7 @@ afr_notify (xlator_t *this, int32_t event,           * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.           */          priv->did_discovery = _gf_false; - +        latency_samples = child_xlator->client_latency.count;          /* parent xlators dont need to know about every child_up, child_down           * because of afr ha. If all subvolumes go down, child_down has @@ -4329,7 +4960,7 @@ afr_notify (xlator_t *this, int32_t event,           * subsequent revalidate lookup happens on all the dht's subvolumes           * which triggers afr self-heals if any.           
*/ -        idx = find_child_index (this, data); +        idx = find_child_index (this, child_xlator);          if (idx < 0) {                  gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,                          "Received child_up from invalid subvolume"); @@ -4338,6 +4969,28 @@ afr_notify (xlator_t *this, int32_t event,          had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,                                                             this); + +        if (!priv->halo_enabled || +            latency_samples < priv->halo_min_samples) { +                child_halo_enabled = _gf_false; +                halo_max_latency_msec = INT64_MAX; +        } else { +                child_halo_enabled = _gf_true; +                halo_max_latency_msec = _afr_get_halo_latency (this); +        } + +        if (event == GF_EVENT_CHILD_PING) { +                /* Calculates the child latency and sets event +                 */ +                LOCK (&priv->lock); +                { +                        _afr_handle_ping_event (this, child_xlator, idx, +                                halo_max_latency_msec, &event, +                                &child_latency_msec, child_halo_enabled); +                } +                UNLOCK (&priv->lock); +        } +          if (event == GF_EVENT_TRANSLATOR_OP) {                  LOCK (&priv->lock);                  { @@ -4364,52 +5017,16 @@ afr_notify (xlator_t *this, int32_t event,                          propagate = 1;                          break;                  case GF_EVENT_CHILD_UP: -                        /* -                         * This only really counts if the child was never up -                         * (value = -1) or had been down (value = 0).  See -                         * comment at GF_EVENT_CHILD_DOWN for a more detailed -                         * explanation. 
-                         */ -                        if (priv->child_up[idx] != 1) { -                                priv->event_generation++; -                        } -                        priv->child_up[idx] = 1; - -                        call_psh = 1; -                        up_children = __afr_get_up_children_count (priv); -                        if (up_children == 1) { -                                gf_msg (this->name, GF_LOG_INFO, 0, -                                        AFR_MSG_SUBVOL_UP, -                                        "Subvolume '%s' came back up; " -                                     "going online.", ((xlator_t *)data)->name); -                        } else { -                                event = GF_EVENT_CHILD_MODIFIED; -                        } - -                        priv->last_event[idx] = event; - +                        _afr_handle_child_up_event (this, child_xlator, +                                idx, halo_max_latency_msec, &event, &call_psh, +                                &up_child, child_halo_enabled);                          break;                  case GF_EVENT_CHILD_DOWN: -                        if (priv->child_up[idx] == 1) { -                                priv->event_generation++; -                        } -                        priv->child_up[idx] = 0; - -                        for (i = 0; i < priv->child_count; i++) -                                if (priv->child_up[i] == 0) -                                        down_children++; -                        if (down_children == priv->child_count) { -                                gf_msg (this->name, GF_LOG_ERROR, 0, -                                        AFR_MSG_ALL_SUBVOLS_DOWN, -                                       "All subvolumes are down. 
Going offline " -                                    "until atleast one of them comes back up."); -                        } else { -                                event = GF_EVENT_SOME_CHILD_DOWN; -                        } - -                        priv->last_event[idx] = event; - +                        _afr_handle_child_down_event (this, child_xlator, idx, +                                child_latency_msec, halo_max_latency_msec, +                                &event, &call_psh, &up_child, +                                child_halo_enabled);                          break;                  case GF_EVENT_CHILD_CONNECTING: @@ -4436,7 +5053,6 @@ afr_notify (xlator_t *this, int32_t event,                             had come up, propagate CHILD_UP, but only this time                          */                          event = GF_EVENT_CHILD_DOWN; -                        up_children = __afr_get_up_children_count (priv);                          for (i = 0; i < priv->child_count; i++) {                                  if (priv->last_event[i] == GF_EVENT_CHILD_UP) {                                          event = GF_EVENT_CHILD_UP; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7f7962013d7..c7d6261b110 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -46,7 +46,8 @@ enum gf_afr_mem_types_ {  	gf_afr_mt_spbc_timeout_t,          gf_afr_mt_spb_status_t,          gf_afr_mt_empty_brick_t, -        gf_afr_mt_end +        gf_afr_mt_child_latency_t, +    gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 1b3b1ca0af1..9c12e433097 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -371,7 +371,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,  {  	afr_private_t *priv = NULL;  	off_t off = 0; -	
size_t block = 128 * 1024; +	size_t block = 0;  	int type = AFR_SELFHEAL_DATA_FULL;  	int ret = -1;  	call_frame_t *iter_frame = NULL; @@ -383,6 +383,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,                  healed_sinks[ARBITER_BRICK_INDEX] = 0;          } +        block = 128 * 1024 * priv->data_self_heal_window_size; +          type = afr_data_self_heal_type_get (priv, healed_sinks, source,                                              replies); diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c6ac5ebfd1b..4ac1d32f58a 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -58,6 +58,7 @@ typedef struct {          eh_t                    **statistics;          uint32_t                max_threads;          uint32_t                wait_qlength; +        uint32_t                halo_max_latency_msec;  } afr_self_heald_t; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 6f4783c9213..ae9b28c7fb4 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -176,6 +176,42 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("data-self-heal-algorithm",                            priv->data_self_heal_algorithm, options, str, out); +        GF_OPTION_RECONF ("halo-enabled", +                          priv->halo_enabled, options, bool, +                          out); + +        GF_OPTION_RECONF ("halo-failover-enabled", +                          priv->halo_failover_enabled, options, bool, +                          out); + +        GF_OPTION_RECONF ("halo-shd-max-latency", +                          priv->shd.halo_max_latency_msec, options, uint32, +                          out); + +        GF_OPTION_RECONF ("halo-nfsd-max-latency", +                          priv->nfsd.halo_max_latency_msec, options, uint32, +                          out); + +        
GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec, +                          options, uint32, out); + +        GF_OPTION_RECONF ("halo-hybrid-mode", +                          priv->halo_hybrid_mode, options, bool, +                          out); + +        GF_OPTION_RECONF ("halo-hybrid-read-max-latency", +                          priv->halo_hybrid_read_max_latency_msec, options, +                          uint32, out); + +        GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options, +                              uint32, out); + +        GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, +                              uint32, out); + +        GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options, +                              uint32, out); +          GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);          GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -396,6 +432,35 @@ init (xlator_t *this)          GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); +        GF_OPTION_INIT ("halo-hybrid-mode", +                        priv->halo_hybrid_mode, bool, out); + +        GF_OPTION_INIT ("halo-hybrid-read-max-latency", +                        priv->halo_hybrid_read_max_latency_msec, uint32, +                        out); + +        GF_OPTION_INIT ("halo-enabled", +                        priv->halo_enabled, bool, out); + +        GF_OPTION_INIT ("halo-failover-enabled", +                        priv->halo_failover_enabled, bool, out); + +        GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec, +                        uint32, out); +        GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec, +                        uint32, out); +        GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32, +                        out); +        GF_OPTION_INIT ("halo-min-replicas", 
priv->halo_min_replicas, uint32, +                        out); +        GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32, +                        out); + +        GF_OPTION_INIT ("halo-nfsd-max-latency", +                        priv->nfsd.halo_max_latency_msec, uint32, out); + +        GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); +          GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);          GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -445,17 +510,24 @@ init (xlator_t *this)          priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,                                      gf_afr_mt_char); -        if (!priv->child_up) { + +        priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency), +                                         child_count, +                                         gf_afr_mt_child_latency_t); + +        if (!priv->child_up || !priv->child_latency) {                  ret = -ENOMEM;                  goto out;          } -        for (i = 0; i < child_count; i++) +        for (i = 0; i < child_count; i++) { +                priv->child_latency[i] = 0.0;                  priv->child_up[i] = -1; /* start with unknown state.                                             this initialization needed                                             for afr_notify() to work                                             reliably                                          */ +        }          priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,                                      gf_afr_mt_xlator_t); @@ -663,6 +735,85 @@ struct volume_options options[] = {                           "jobs that can perform parallel heals in the "                           "background."          
}, +        { .key   = {"halo-shd-max-latency"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "99999", +           .description = "Maximum latency for shd halo replication in msec." +        }, +        { .key   = {"halo-enabled"}, +          .type  = GF_OPTION_TYPE_BOOL, +          .default_value = "False", +           .description = "Enable Halo (geo) replication mode." +        }, +        { .key   = {"halo-failover-enabled"}, +          .type  = GF_OPTION_TYPE_BOOL, +          .default_value = "False", +           .description = "Enable x-halo failover: will allow failover " +                          "to bricks outside the client or daemons' halo " +                          "in an attempt to satisfy halo-min-replicas." +        }, +        { .key   = {"halo-nfsd-max-latency"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "5", +          .description = "Maximum latency for nfsd halo replication in msec." +        }, +        { .key   = {"halo-max-latency"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "5", +           .description = "Maximum latency for halo replication in msec." +        }, +        { .key   = {"halo-hybrid-mode"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "Enable hybrid sync mounts.  When enabled, halo will " +                         "do write FOPs synchronously, and read FOPs will be " +                         "services in-region if the inode is clean/consistent." +                         "If no bricks can be found below " +                         "halo-hybrid-max-read-latency then the best 2 shall " +                         "be selected.  This option can be used in " +                         "conjunction with all other halo options." 
+        }, +        { .key   = {"halo-hybrid-read-max-latency"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "8", +           .description = "Maximum latency hybrid mode will use to select " +                          "children for read FOPs.  Don't tune this unless " +                          "you really know what you are doing (i.e. you've " +                          "read/understand the associated source code)." +        }, +        { .key   = {"halo-max-replicas"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "99999", +           .description = "The maximum number of halo replicas; replicas" +                          " beyond this value will be written asynchronously" +                          "via the SHD." +        }, +        { .key   = {"halo-min-replicas"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "2", +           .description = "The minimum number of halo replicas, before adding " +                          "out of region replicas." +        }, +        { .key   = {"halo-min-samples"}, +          .type  = GF_OPTION_TYPE_INT, +          .min   = 1, +          .max   = 99999, +          .default_value = "3", +           .description = "The minimum number of halo latency samples, before " +                          "we start forming the halos." +        },          { .key  = {"heal-wait-queue-length"},            .type = GF_OPTION_TYPE_INT,            .min  = 0, @@ -802,6 +953,13 @@ struct volume_options options[] = {                           "translator is running as part of self-heal-daemon "                           "or not."          
}, +        { .key = {"iam-nfs-daemon"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "This option differentiates if the replicate " +                         "translator is running as part of an NFS daemon " +                         "or not." +        },          { .key = {"quorum-type"},            .type = GF_OPTION_TYPE_STR,            .value = { "none", "auto", "fixed"}, @@ -866,7 +1024,7 @@ struct volume_options options[] = {  	},          { .key  = {"heal-timeout"},            .type = GF_OPTION_TYPE_INT, -          .min  = 60, +          .min  = 5,            .max  = INT_MAX,            .default_value = "600",            .description = "time interval for checking the need to self-heal " diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 70c3e349743..aa19f1eeb37 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -30,6 +30,9 @@  #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"  #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) +#define AFR_CHILD_DOWN_LATENCY INT64_MAX  /* Latency for down children */ +#define AFR_HALO_HYBRID_CHILD_LIMIT 2   /* Examine bricks <= 10 msec */ +#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0   /* Examine bricks <= 10 msec */  #define AFR_LOCKEE_COUNT_MAX    3  #define AFR_DOM_COUNT_MAX    3  #define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/ @@ -72,6 +75,17 @@ typedef enum {          AFR_FAV_CHILD_POLICY_MAX,  } afr_favorite_child_policy; +struct afr_nfsd { +        gf_boolean_t     iamnfsd; +        uint32_t         halo_max_latency_msec; +}; + +struct afr_child { +        uint32_t idx; +        int64_t latency; +        unsigned char child_up; +}; +  typedef struct _afr_private {          gf_lock_t lock;               /* to guard access to child_count, etc */          unsigned int child_count;     /* total number of children   */ @@ -83,6 +97,7 @@ typedef struct _afr_private {        
  inode_t *root_inode;          unsigned char *child_up; +        int64_t *child_latency;          unsigned char *local;          char **pending_key; @@ -153,8 +168,19 @@ typedef struct _afr_private {          gf_boolean_t           ensure_durability;          char                   *sh_domain;  	char                   *afr_dirty; - -	afr_self_heald_t       shd; +        gf_boolean_t           halo_enabled; + +        /* Halo geo-replication tunables */ +        gf_boolean_t           halo_failover_enabled; +        gf_boolean_t           halo_hybrid_mode; +        uint32_t               halo_hybrid_read_max_latency_msec; +        uint32_t               halo_max_latency_msec; +        uint32_t               halo_max_replicas; +        uint32_t               halo_min_replicas; +        uint32_t               halo_min_samples; + +        afr_self_heald_t       shd; +        struct afr_nfsd        nfsd;          gf_boolean_t           consistent_metadata;          uint64_t               spb_choice_timeout; diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/cluster/aha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am new file mode 100644 index 00000000000..006db127d28 --- /dev/null +++ b/xlators/cluster/aha/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = aha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +aha_la_LDFLAGS = -module -avoid-version + +aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c +aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = aha-mem-types.h aha.h aha-helpers.h aha.h aha-retry.h aha-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ +	-I$(top_srcdir)/rpc/xdr/src \ +	-I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = 
diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c new file mode 100644 index 00000000000..3b2ca641de2 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.c @@ -0,0 +1,952 @@ +#include "aha-fops.h" + +static void +__save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ +        list_add_tail (&fop->list, &conf->failed); +} + +void +save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ +        LOCK (&conf->lock); +        { +                __save_fop (fop, conf); +        } +        UNLOCK (&conf->lock); +} + +#define AHA_HANDLE_FOP(frame, type, cbk, obj, fn, args ...)             \ +        do {                                                            \ +                struct aha_fop *fop = aha_fop_new ();                        \ +                if (!fop) {                                             \ +                        gf_log (GF_AHA, GF_LOG_CRITICAL,                \ +                                "Allocation failed, terminating "       \ +                                "to prevent a hung mount.");            \ +                        assert (0);                                     \ +                }                                                       \ +                fop->stub = fop_##type##_stub (frame, aha_##type,       \ +                                                args);                  \ +                fop->frame = frame;                                     \ +                frame->local = fop;                                     \ +                STACK_WIND (frame, cbk, obj, fn, args);                 \ +        } while (0)                                                     \ + +/* + * AHA_HANDLE_FOP_CBK + * + * 1) If the error returned is ENOTCONN *and* the timer that waits + *    for the server to come back has not expired, store the fop to retry later. + * 2) If the timer waiting for the server has expired, just unwind. 
+ * 3) If the error returned is something other than ENOTCONN, just unwind. + * + */ +#define AHA_HANDLE_FOP_CBK(type, frame, args ...)                       \ +        do {                                                            \ +                struct aha_conf *conf = frame->this->private;                \ +                struct aha_fop *fop = frame->local;                          \ +                if (op_ret != 0 && op_errno == ENOTCONN &&              \ +                        !aha_is_timer_expired (conf)) {                 \ +                        gf_log (GF_AHA, GF_LOG_WARNING,                 \ +                                "Got ENOTCONN from client, storing "    \ +                                "to retry later!");                     \ +                        save_fop (fop, conf);                           \ +                } else {                                                \ +                        AHA_DESTROY_LOCAL (frame);                      \ +                        STACK_UNWIND_STRICT (type, frame, args);        \ +                }                                                       \ +        } while (0)                                                     \ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, inode_t *inode, +		      struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ +        AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode, +                                buf, xdata, postparent); +        return 0; +} + + +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, +                  dict_t *xdata) +{ +        AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk, +                        FIRST_CHILD (this), +		        FIRST_CHILD (this)->fops->lookup, +		        loc, xdata); +        return 0; +} + + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, struct 
iatt *buf, +                    dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata); +        return 0; +} + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, stat, aha_stat_cbk, +		        FIRST_CHILD (this), +		        FIRST_CHILD (this)->fops->stat, +		        loc, xdata); +        return 0; +} + + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, +                       struct iatt *preop, struct iatt *postop, +                       dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop, +                            postop, xdata); +        return 0; +} + + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +                   struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk, +		        FIRST_CHILD (this), +		        FIRST_CHILD (this)->fops->setattr, +		        loc, stbuf, valid, xdata); +        return 0; +} + + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, +                       struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop, +                            postop, xdata); +        return 0; +} + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                    struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk, +		        FIRST_CHILD (this), +		        FIRST_CHILD (this)->fops->fsetattr, +		        fd, stbuf, valid, xdata); +        return 0; +} + + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, +			struct iatt *prebuf, struct iatt *postbuf, +                        dict_t *xdata) +{ +	
AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno, +                             prebuf, postbuf, xdata); +        return 0; +} + + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, +                    off_t offset, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->truncate, +		    loc, offset, xdata); +        return 0; +} + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			 int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                         struct iatt *postbuf, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno, +                             prebuf, postbuf, xdata); +        return 0; +} + + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, +		     off_t offset, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->ftruncate, +		    fd, offset, xdata); +        return 0; +} + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, +		  int32_t mask, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, access, aha_access_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->access, +		    loc, mask, xdata); +        return 0; +} + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, +			const char *path, struct iatt *sbuf, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno, +                            path, sbuf, xdata); +        return 0; +} + + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, +                    size_t size, dict_t *xdata) +{ +	
AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->readlink, +		    loc, size, xdata); +        return 0; +} + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, inode_t *inode, +                     struct iatt *buf, struct iatt *preparent, +                     struct iatt *postparent, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno, +                             inode, buf, +                             preparent, postparent, xdata); +        return 0; +} + + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, +		 mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->mknod, +		    loc, mode, rdev, umask, xdata); +        return 0; +} + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, inode_t *inode, +                     struct iatt *buf, struct iatt *preparent, +                     struct iatt *postparent, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno, +                             inode, buf, +                             preparent, postparent, xdata); +        return 0; +} + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, +		 loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->mkdir, +		    loc, mode, umask, xdata); +        return 0; +} + + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +                      struct iatt *preparent, struct iatt *postparent, +                      dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, op_errno, +                             preparent, postparent, xdata); +     
   return 0; +} + + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +                  dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->unlink, +		    loc, xflag, xdata); +        return 0; +} + + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, +                     struct iatt *preparent, struct iatt *postparent, +                     dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno, +                             preparent, postparent, xdata); +        return 0; +} + + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, +                 dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->rmdir, +		    loc, flags, xdata); +        return 0; +} + + +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, inode_t *inode, +                       struct iatt *buf, struct iatt *preparent, +                       struct iatt *postparent, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf, +                             preparent, postparent, xdata); +        return 0; +} + + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, +		   loc_t *loc, mode_t umask, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->symlink, +		    linkpath, loc, umask, xdata); +        return 0; +} + + +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, struct iatt *buf, +                      struct iatt *preoldparent, struct iatt *postoldparent, +                      struct iatt *prenewparent, struct iatt *postnewparent, +                      
dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf, +                             preoldparent, postoldparent, +                             prenewparent, postnewparent, xdata); +        return 0; +} + + +int +aha_rename (call_frame_t *frame, xlator_t *this, +		  loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, rename, aha_rename_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->rename, +		    oldloc, newloc, xdata); +        return 0; +} + + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, inode_t *inode, +                    struct iatt *buf, struct iatt *preparent, +                    struct iatt *postparent, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf, +                             preparent, postparent, xdata); +        return 0; +} + + +int +aha_link (call_frame_t *frame, xlator_t *this, +		loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, link, aha_link_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->link, +		    oldloc, newloc, xdata); +        return 0; +} + + +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +		      fd_t *fd, inode_t *inode, struct iatt *buf, +                      struct iatt *preparent, struct iatt *postparent, +                      dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf, +                             preparent, postparent, xdata); +        return 0; +} + + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, +		  int32_t flags, mode_t mode, mode_t umask, fd_t *fd, +                  dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, create, aha_create_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->create, +		    loc, flags, mode, umask, fd, xdata); +        return 0; +} + + +int +aha_open_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata); +        return 0; +} + + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, +		int32_t flags, fd_t *fd, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, open, aha_open_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->open, +		    loc, flags, fd, xdata); +        return 0; +} + +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, +		     struct iovec *vector, int32_t count, +		     struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno, +                             vector, count, stbuf, iobref, xdata); +        return 0; +} + +int +aha_readv (call_frame_t *frame, xlator_t *this, +		 fd_t *fd, size_t size, off_t offset, uint32_t flags, +                 dict_t *xdata) +{ +        AHA_HANDLE_FOP (frame, readv, aha_readv_cbk, +                        FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, +                        fd, size, offset, flags, xdata); +        return 0; +} + + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +                      struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ +        AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno, +                            prebuf, postbuf, xdata); +        return 0; +} + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +		  struct iovec *vector, int32_t count, +		  off_t off, uint32_t flags, struct iobref *iobref, +                  dict_t *xdata) +{ +        AHA_HANDLE_FOP (frame, writev, aha_writev_cbk, +                        FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, +                        fd, vector, count, off, flags, iobref, xdata); +        return 0; +} + + 
+int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, flush, aha_flush_cbk, +                    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->flush, +		    fd, xdata); +        return 0; +} + + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, +		     xlator_t *this, int32_t op_ret, +		     int32_t op_errno, struct iatt *prebuf, +                     struct iatt *postbuf, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno, +                            prebuf, postbuf, xdata); +        return 0; +} + + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, +           dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fsync, +		    fd, flags, xdata); +        return 0; +} + + +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, struct iatt *buf, +               dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata); +        return 0; +} + + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fstat, +		    fd, xdata); +        return 0; +} + + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, fd_t *fd, +                       dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata); +        return 0; +} + + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, +             dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, 
opendir, aha_opendir_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->opendir, +		    loc, fd, xdata); +        return 0; +} + +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, +                    int32_t flags, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fsyncdir, +		    fd, flags, xdata); +        return 0; +} + + +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int32_t op_ret, int32_t op_errno, struct statvfs *buf, +                dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata); +        return 0; +} + + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->statfs, +		    loc, xdata); +        return 0; +} + + + +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +		    dict_t *dict, int32_t flags, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->setxattr, +		    loc, dict, flags, xdata); +        return 0; +} + + +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, dict_t *dict, +                  dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata); +        return 0; +} + + +int +aha_getxattr (call_frame_t 
*frame, xlator_t *this, loc_t *loc, +		    const char *name, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->getxattr, +		    loc, name, xdata); +        return 0; +} + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                     dict_t *dict, int32_t flags, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fsetxattr, +		    fd, dict, flags, xdata); +        return 0; +} + + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, dict_t *dict, +                   dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata); +        return 0; +} + + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                     const char *name, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fgetxattr, +		    fd, name, xdata); +        return 0; +} + + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                 int32_t op_ret, int32_t op_errno, dict_t *dict, +                 dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata); +        return 0; +} + + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, +		   gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->xattrop, +		    loc, flags, dict, xdata); +        return 0; +} + + +int 
+aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, dict_t *dict, +                  dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata); +        return 0; +} + + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, +		    gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fxattrop, +		    fd, flags, dict, xdata); +        return 0; +} + + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +		       const char *name, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->removexattr, +		    loc, name, xdata); +        return 0; +} + +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                        const char *name, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fremovexattr, +		    fd, name, xdata); +        return 0; +} + + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +            int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +            dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata); +        return 0; +} + + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, +	      
struct gf_flock *lock, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, lk, aha_lk_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->lk, +		    fd, cmd, lock, xdata); +        return 0; +} + + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, +		   const char *volume, loc_t *loc, int32_t cmd, +                   struct gf_flock *lock, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->inodelk, +		    volume, loc, cmd, lock, xdata); +        return 0; +} + + +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, +		    const char *volume, fd_t *fd, int32_t cmd, +                    struct gf_flock *lock, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->finodelk, +		    volume, fd, cmd, lock, xdata); +        return 0; +} + + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, +		   const char *volume, loc_t *loc, const char *basename, +		   entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->entrylk, +		    volume, loc, basename, cmd, type, xdata); +        return 0; 
+} + + +int +aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata); +        return 0; +} + + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, +		    const char *volume, fd_t *fd, const char *basename, +		    entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fentrylk, +		    volume, fd, basename, cmd, type, xdata); +        return 0; +} + +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, +                       dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata); +	return 0; +} + + +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, +		   size_t size, off_t off, dict_t *xdata) +{ +	AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->readdir, +		    fd, size, off, xdata); +	return 0; +} + + +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, +                        dict_t *xdata) +{ +	AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata); +	return 0; +} + + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +                    off_t off, dict_t *dict) +{ +	AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk, +                    FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->readdirp, +                    fd, size, off, dict); +	return 0; +} diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h new file mode 100644 index 00000000000..b1fb9d38a80 --- /dev/null +++ 
b/xlators/cluster/aha/src/aha-fops.h @@ -0,0 +1,360 @@ +#ifndef _AHA_FOPS_H +#define _AHA_FOPS_H + +#include "aha.h" +#include "aha-helpers.h" + +/* FOP functions */ +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +                struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, +                off_t offset, dict_t *xdata); + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, +                off_t offset, dict_t *xdata); + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, +                dict_t *xdata); + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, +                dict_t *xdata); + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +                dev_t rdev, mode_t umask, dict_t *xdata); + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +                mode_t umask, dict_t *xdata); + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +                dict_t *xdata); + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, +                dict_t *xdata); + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, +                loc_t *loc, mode_t umask, dict_t *xdata); + +int +aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, +                dict_t *xdata); + +int +aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, +                dict_t *xdata); + +int +aha_create (call_frame_t *frame, xlator_t 
*this, loc_t *loc, int32_t flags, +                mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +                fd_t *fd, dict_t *xdata); + +int +aha_readv (call_frame_t *frame, xlator_t *this, +		 fd_t *fd, size_t size, off_t offset, uint32_t flags, +                 dict_t *xdata); + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, +                int32_t count, off_t off, uint32_t flags, +                struct iobref *iobref, dict_t *xdata); + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, +                int32_t flags, dict_t *xdata); + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, +                dict_t *xdata); + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, +                dict_t *xdata); + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, +                int32_t flags, dict_t *xdata); + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +                const char *name, dict_t *xdata); + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                dict_t *dict, int32_t flags, dict_t *xdata); + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                const char *name, dict_t *xdata); + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, +                gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, +                gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_removexattr 
(call_frame_t *frame, xlator_t *this, loc_t *loc, +                        const char *name, dict_t *xdata); + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +                        const char *name, dict_t *xdata); + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, +        struct gf_flock *lock, dict_t *xdata); + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, +                loc_t *loc, int32_t cmd, struct gf_flock *lock, +                dict_t *xdata); + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, +		    const char *volume, fd_t *fd, int32_t cmd, +                    struct gf_flock *lock, dict_t *xdata); + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, +		   const char *volume, loc_t *loc, const char *basename, +		   entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, +		    const char *volume, fd_t *fd, const char *basename, +		    entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +                off_t off, dict_t *xdata); + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +                off_t off, dict_t *dict); + +/* Callback functions */ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, inode_t *inode, +		      struct iatt *buf, dict_t *xdata, struct iatt *postparent); + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, struct iatt *buf, +                    dict_t *xdata); + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, +                       struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, +                       int32_t op_ret, int32_t op_errno, +                       struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, +			struct iatt *prebuf, struct iatt *postbuf, +                        dict_t *xdata); + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			 int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                         struct iatt *postbuf, dict_t *xdata); + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, dict_t *xdata); + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, +			const char *path, struct iatt *sbuf, dict_t *xdata); + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, inode_t *inode, +                     struct iatt *buf, struct iatt *preparent, +                     struct iatt *postparent, dict_t *xdata); + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, inode_t *inode, +                     struct iatt *buf, struct iatt *preparent, +                     struct iatt *postparent, dict_t *xdata); + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +                      struct iatt *preparent, struct iatt *postparent, +                      dict_t *xdata); + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, +                     struct iatt *preparent, struct iatt *postparent, +                     dict_t *xdata); +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, inode_t *inode, +                       
struct iatt *buf, struct iatt *preparent, +                       struct iatt *postparent, dict_t *xdata); +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, struct iatt *buf, +                      struct iatt *preoldparent, struct iatt *postoldparent, +                      struct iatt *prenewparent, struct iatt *postnewparent, +                      dict_t *xdata); + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, inode_t *inode, +                    struct iatt *buf, struct iatt *preparent, +                    struct iatt *postparent, dict_t *xdata); +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +		      fd_t *fd, inode_t *inode, struct iatt *buf, +                      struct iatt *preparent, struct iatt *postparent, +                      dict_t *xdata); +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		    int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata); +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, +		     struct iovec *vector, int32_t count, +		     struct iatt *stbuf, struct iobref *iobref, dict_t *xdata); + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, +                      struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata); +int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, +		     xlator_t *this, int32_t op_ret, +		     int32_t op_errno, struct iatt *prebuf, +                     struct iatt *postbuf, dict_t *xdata); +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, 
struct iatt *buf, +                     dict_t *xdata); + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, fd_t *fd, +                       dict_t *xdata); +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, struct statvfs *buf, +                      dict_t *xdata); +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *dict, +                        dict_t *xdata); + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, dict_t *dict, +                         dict_t *xdata); + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *dict, +                       dict_t *xdata); + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno, dict_t *dict, +                        dict_t *xdata); + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                  dict_t *xdata); + +int +aha_inodelk_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, +                       dict_t *xdata); +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, +                        dict_t *xdata); + +#endif /* _AHA_FOPS_H */ diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c new file mode 100644 index 00000000000..e3b713688d3 --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.c @@ -0,0 +1,46 @@ +#include "aha-helpers.h" + +struct aha_conf *aha_conf_new () +{ +        struct aha_conf *conf = NULL; + +        conf = GF_CALLOC (1, sizeof (*conf), gf_aha_mt_conf); +        if (!conf) +                goto err; + +        INIT_LIST_HEAD (&conf->failed); + +        LOCK_INIT (&conf->lock); +err: +        return conf; +} + +void aha_conf_destroy (struct aha_conf *conf) +{ +        LOCK_DESTROY (&conf->lock); +        GF_FREE (conf); +} + +struct aha_fop *aha_fop_new () +{ +        struct aha_fop *fop = NULL; + +        fop = GF_CALLOC (1, sizeof (*fop), gf_aha_mt_fop); +        if (!fop) +                goto err; + +        INIT_LIST_HEAD (&fop->list); + +err: +        return fop; +} + +void aha_fop_destroy (struct aha_fop *fop) +{ +        if (!fop) +                return; + +        
call_stub_destroy (fop->stub); +        fop->stub = NULL; +        GF_FREE (fop); +} diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h new file mode 100644 index 00000000000..d9cf9b3295d --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.h @@ -0,0 +1,23 @@ +#ifndef _AHA_HELPERS_H +#define _AHA_HELPERS_H + +#include "aha.h" + +#define GF_AHA "aha" + +struct aha_conf *aha_conf_new (); + +void aha_conf_destroy (struct aha_conf *conf); + +struct aha_fop *aha_fop_new (); + +void aha_fop_destroy (struct aha_fop *fop); + +#define AHA_DESTROY_LOCAL(frame)                        \ +        do {                                            \ +                struct aha_fop *fop = frame->local;          \ +                aha_fop_destroy (fop);                  \ +                frame->local = NULL;                    \ +        } while (0)                                     \ + +#endif /* _AHA_HELPERS_H */ diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h new file mode 100644 index 00000000000..117dda27e8b --- /dev/null +++ b/xlators/cluster/aha/src/aha-mem-types.h @@ -0,0 +1,22 @@ +/* +   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +   This file is part of GlusterFS. + +   This file is licensed to you under your choice of the GNU Lesser +   General Public License, version 3 or any later version (LGPLv3 or +   later), or the GNU General Public License, version 2 (GPLv2), in all +   cases as published by the Free Software Foundation. 
+*/ + +#ifndef __AHA_MEM_TYPES_H__ +#define __AHA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_aha_mem_types_ { +        gf_aha_mt_begin_t = gf_common_mt_end + 1, +        gf_aha_mt_conf, +        gf_aha_mt_fop, +        gf_aha_mt_end +}; +#endif diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c new file mode 100644 index 00000000000..8810f913f42 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.c @@ -0,0 +1,524 @@ +#include "aha.h" +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" + +/* + * AHA_RETRY_FOP: + * + * - We STACK_WIND the fop using the arguments in the call_stub. + *   We use STACK_WIND because we need a *new* frame, since we already + *   exhausted the existing frame with the original STACK_WIND. + * + * - After STACK_WIND completes, we can destroy this frame's local (which + *   should be struct aha_fop *). The frame itself will get destroyed higher in + *   the xlator graph, since its still part of the call stack. + */ +#define AHA_RETRY_FOP(fop, type, args ...)                              
\ +        do {                                                            \ +                call_stub_t *stub = fop->stub;                          \ +                call_frame_t *frame = fop->frame;                       \ +                xlator_t *this = frame->this;                           \ +                STACK_WIND (frame, aha_##type##_cbk, this,              \ +                            this->fops->type, args);                    \ +                AHA_DESTROY_LOCAL (frame);                              \ +        } while (0)                                                     \ + +#define AHA_UNWIND_FOP(fop, type)                                       \ +        do {                                                            \ +                call_frame_t *frame = fop->frame;                       \ +                AHA_DESTROY_LOCAL (frame);                              \ +                default_##type##_failure_cbk (frame, ETIMEDOUT);        \ +        } while (0)                                                     \ + +void +__aha_retry_force_unwind_fops (struct aha_conf *conf) +{ +        struct aha_fop *fop = NULL; +        struct aha_fop *tmp = NULL; +        size_t  ndrained = 0; + +        /* +         * Drain the queue. After we finish the loop, the list +         * must be empty. 
+         */
+        list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+                list_del (&fop->list);
+                aha_force_unwind_fop (fop);
+                ndrained++;
+        }
+
+        gf_log (GF_AHA, GF_LOG_WARNING,
+                "Force-unwound %"GF_PRI_SIZET" fops!", ndrained);
+
+        assert (list_empty (&conf->failed));
+}
+
+/*
+ * Locked wrapper: force-unwind every queued fop while holding conf->lock.
+ */
+void
+aha_force_unwind_fops (struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        {
+                __aha_retry_force_unwind_fops (conf);
+        }
+        UNLOCK (&conf->lock);
+}
+
+/*
+ * Retry every fop queued on conf->failed, removing each from the list
+ * before handing it to aha_retry_fop (). Does nothing (beyond logging)
+ * when the child is not up or the queue is empty. The only caller visible
+ * here, aha_retry_failed_fops (), invokes this with conf->lock held.
+ */
+void
+__aha_retry_failed_fops (struct aha_conf *conf)
+{
+        struct aha_fop *fop = NULL;
+        struct aha_fop *tmp = NULL;
+        size_t ndrained = 0;
+
+        /*
+         * Skip if the child is not up
+         */
+        if (!conf->child_up) {
+                gf_log (GF_AHA, GF_LOG_WARNING,
+                        "Waiting for child to come up before retrying.");
+                return;
+        }
+
+        /*
+         * Skip if the queue is empty. Returning here keeps us from also
+         * reporting a zero-length drain below.
+         */
+        if (list_empty (&conf->failed)) {
+                gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry.");
+                return;
+        }
+
+        /*
+         * Drain the queue. After we finish the loop, the list
+         * must be empty.
+         */
+        list_for_each_entry_safe (fop, tmp, &conf->failed, list) {
+                list_del (&fop->list);
+                aha_retry_fop (fop);
+                ndrained++;
+        }
+
+        gf_log (GF_AHA, GF_LOG_WARNING,
+                "Drained %"GF_PRI_SIZET" fops!", ndrained);
+
+        assert (list_empty (&conf->failed));
+}
+
+
+/*
+ * Locked wrapper: retry the queued fops while holding conf->lock.
+ */
+void
+aha_retry_failed_fops (struct aha_conf *conf)
+{
+        LOCK (&conf->lock);
+        {
+                __aha_retry_failed_fops (conf);
+        }
+        UNLOCK (&conf->lock);
+}
+
+void aha_retry_fop (struct aha_fop *fop)
+{
+        call_stub_t *stub = fop->stub;
+
+        switch (stub->fop) {
+        case GF_FOP_OPEN:
+                AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags,
+                                stub->args.fd, stub->args.xdata);
+                break;
+
+        case GF_FOP_CREATE:
+                AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags,
+                                stub->args.mode, stub->args.umask,
+                                stub->args.fd,
+                                stub->args.xdata);
+                break;
+
+        case GF_FOP_STAT:
+                AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata);
+                break;
+
+        case GF_FOP_READLINK:
+                AHA_RETRY_FOP (fop, readlink, &stub->args.loc,
+                                stub->args.size, stub->args.xdata);
+                break;
+
+        case GF_FOP_MKNOD:
+                AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode,
+                                stub->args.rdev, stub->args.umask,
+                                stub->args.xdata);
+	        break;
+
+        case GF_FOP_MKDIR:
+                AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode,
+                                stub->args.umask, stub->args.xdata);
+                break;
+
+        case GF_FOP_UNLINK:
+                AHA_RETRY_FOP
(fop, unlink, &stub->args.loc, stub->args.xflag, +                                stub->args.xdata); +                break; + +        case GF_FOP_RMDIR: +                AHA_RETRY_FOP (fop, rmdir, &stub->args.loc, +                                stub->args.flags, stub->args.xdata); +                break; + +        case GF_FOP_SYMLINK: +                AHA_RETRY_FOP (fop, symlink, stub->args.linkname, +                                &stub->args.loc, stub->args.umask, +                                stub->args.xdata); +                break; + +        case GF_FOP_RENAME: +                AHA_RETRY_FOP (fop, rename, &stub->args.loc, +                                &stub->args.loc2, stub->args.xdata); +                break; + +        case GF_FOP_LINK: +                AHA_RETRY_FOP (fop, link, &stub->args.loc, +                                &stub->args.loc2, stub->args.xdata); +                break; + +        case GF_FOP_TRUNCATE: +                AHA_RETRY_FOP (fop, truncate, &stub->args.loc, +                                stub->args.offset, stub->args.xdata); +                break; + +        case GF_FOP_READ: +                AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size, +                                stub->args.offset, stub->args.flags, +                                stub->args.xdata); +                break; + +        case GF_FOP_WRITE: +                AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector, +                                stub->args.count, stub->args.offset, +                                stub->args.flags, stub->args.iobref, +                                stub->args.xdata); +                break; + +        case GF_FOP_STATFS: +                AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata); +                break; + +        case GF_FOP_FLUSH: +                AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata); +                break; + +        case GF_FOP_FSYNC: +          
      AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync, +                                stub->args.xdata); +                break; + +        case GF_FOP_SETXATTR: +                AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr, +		                stub->args.flags, stub->args.xdata); +                break; + +        case GF_FOP_GETXATTR: +                AHA_RETRY_FOP (fop, getxattr, &stub->args.loc, +                                stub->args.name, stub->args.xdata); +                break; + +        case GF_FOP_FSETXATTR: +                AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd, +                                stub->args.xattr, stub->args.flags, +                                stub->args.xdata); +                break; + +        case GF_FOP_FGETXATTR: +                AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd, +                                stub->args.name, stub->args.xdata); +                break; + +        case GF_FOP_REMOVEXATTR: +                AHA_RETRY_FOP (fop, removexattr, &stub->args.loc, +                                stub->args.name, stub->args.xdata); +                break; + +        case GF_FOP_FREMOVEXATTR: +                AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd, +                                stub->args.name, stub->args.xdata); +                break; + +        case GF_FOP_OPENDIR: +                AHA_RETRY_FOP (fop, opendir, &stub->args.loc, +                                stub->args.fd, stub->args.xdata); +                break; + +        case GF_FOP_FSYNCDIR: +                AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd, +                                stub->args.datasync, stub->args.xdata); +                break; + +        case GF_FOP_ACCESS: +                AHA_RETRY_FOP (fop, access, &stub->args.loc, +                                stub->args.mask, stub->args.xdata); +                break; + +        case GF_FOP_FTRUNCATE: +                AHA_RETRY_FOP (fop, ftruncate, 
stub->args.fd, +                                stub->args.offset, stub->args.xdata); +                break; + +        case GF_FOP_FSTAT: +                AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata); +                break; + +        case GF_FOP_LK: +                AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd, +                                &stub->args.lock, stub->args.xdata); +                break; + +        case GF_FOP_INODELK: +                AHA_RETRY_FOP (fop, inodelk, stub->args.volume, +                                &stub->args.loc, stub->args.cmd, +                                &stub->args.lock, stub->args.xdata); +                break; + +        case GF_FOP_FINODELK: +                AHA_RETRY_FOP (fop, finodelk, stub->args.volume, +                                stub->args.fd, stub->args.cmd, +                                &stub->args.lock, stub->args.xdata); +                break; + +        case GF_FOP_ENTRYLK: +                AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc, +	                        stub->args.name, stub->args.entrylkcmd, +		                stub->args.entrylktype, stub->args.xdata); +                break; + +        case GF_FOP_FENTRYLK: +                AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd, +                                stub->args.name, stub->args.entrylkcmd, +                                stub->args.entrylktype, stub->args.xdata); +                break; + +        case GF_FOP_LOOKUP: +                AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata); +                break; + +        case GF_FOP_READDIR: +                AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size, +                                stub->args.offset, stub->args.xdata); +                break; + +        case GF_FOP_READDIRP: +                AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size, +                                
stub->args.offset, stub->args.xdata); +                break; + +        case GF_FOP_XATTROP: +                AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype, +                                stub->args.xattr, stub->args.xdata); +                break; + +        case GF_FOP_FXATTROP: +                AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype, +                                stub->args.xattr, stub->args.xdata); +                break; + +        case GF_FOP_SETATTR: +                AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat, +                                stub->args.valid, stub->args.xdata); +                break; + +        case GF_FOP_FSETATTR: +                AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat, +                                stub->args.valid, stub->args.xdata); +                break; + +        default: +                /* Some fops are not implemented yet: +                 * +                 * GF_FOP_NULL +                 * GF_FOP_RCHECKSUM +                 * GF_FOP_FORGET +                 * GF_FOP_RELEASE +                 * GF_FOP_RELEASEDIR +                 * GF_FOP_GETSPEC +                 * GF_FOP_FALLOCATE +                 * GF_FOP_DISCARD +                 * GF_FOP_ZEROFILL +                 * GF_FOP_MAXVALUE +                 * +                 */ +                gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s", +                        gf_fop_list[stub->fop]); +                assert (0); +                break; +        } +} + +void +aha_force_unwind_fop (struct aha_fop *fop) +{ +        call_stub_t *stub = fop->stub; + +        switch (stub->fop) { +        case GF_FOP_OPEN: +                AHA_UNWIND_FOP (fop, open); +                break; + +        case GF_FOP_CREATE: +                AHA_UNWIND_FOP (fop, create); +                break; + +        case GF_FOP_STAT: +                AHA_UNWIND_FOP (fop, stat); +                break; + 
+        case GF_FOP_READLINK: +                AHA_UNWIND_FOP (fop, readlink); +                break; + +        case GF_FOP_MKNOD: +                AHA_UNWIND_FOP (fop, mknod); +	        break; + +        case GF_FOP_MKDIR: +                AHA_UNWIND_FOP (fop, mkdir); +                break; + +        case GF_FOP_UNLINK: +                AHA_UNWIND_FOP (fop, unlink); +                break; + +        case GF_FOP_RMDIR: +                AHA_UNWIND_FOP (fop, rmdir); +                break; + +        case GF_FOP_SYMLINK: +                AHA_UNWIND_FOP (fop, symlink); +                break; + +        case GF_FOP_RENAME: +                AHA_UNWIND_FOP (fop, rename); +                break; + +        case GF_FOP_LINK: +                AHA_UNWIND_FOP (fop, link); +                break; + +        case GF_FOP_TRUNCATE: +                AHA_UNWIND_FOP (fop, truncate); +                break; + +        case GF_FOP_READ: +                AHA_UNWIND_FOP (fop, readv); +                break; + +        case GF_FOP_WRITE: +                AHA_UNWIND_FOP (fop, writev); +                break; + +        case GF_FOP_STATFS: +                AHA_UNWIND_FOP (fop, statfs); +                break; + +        case GF_FOP_FLUSH: +                AHA_UNWIND_FOP (fop, flush); +                break; + +        case GF_FOP_FSYNC: +                AHA_UNWIND_FOP (fop, fsync); +                break; + +        case GF_FOP_SETXATTR: +                AHA_UNWIND_FOP (fop, setxattr); +                break; + +        case GF_FOP_GETXATTR: +                AHA_UNWIND_FOP (fop, getxattr); +                break; + +        case GF_FOP_FSETXATTR: +                AHA_UNWIND_FOP (fop, fsetxattr); +                break; + +        case GF_FOP_FGETXATTR: +                AHA_UNWIND_FOP (fop, fgetxattr); +                break; + +        case GF_FOP_REMOVEXATTR: +                AHA_UNWIND_FOP (fop, removexattr); +                break; + +        case GF_FOP_FREMOVEXATTR: +           
     AHA_UNWIND_FOP (fop, fremovexattr); +                break; + +        case GF_FOP_OPENDIR: +                AHA_UNWIND_FOP (fop, opendir); +                break; + +        case GF_FOP_FSYNCDIR: +                AHA_UNWIND_FOP (fop, fsyncdir); +                break; + +        case GF_FOP_ACCESS: +                AHA_UNWIND_FOP (fop, access); +                break; + +        case GF_FOP_FTRUNCATE: +                AHA_UNWIND_FOP (fop, ftruncate); +                break; + +        case GF_FOP_FSTAT: +                AHA_UNWIND_FOP (fop, fstat); +                break; + +        case GF_FOP_LK: +                AHA_UNWIND_FOP (fop, lk); +                break; + +        case GF_FOP_INODELK: +                AHA_UNWIND_FOP (fop, inodelk); +                break; + +        case GF_FOP_FINODELK: +                AHA_UNWIND_FOP (fop, finodelk); +                break; + +        case GF_FOP_ENTRYLK: +                AHA_UNWIND_FOP (fop, entrylk); +                break; + +        case GF_FOP_FENTRYLK: +                AHA_UNWIND_FOP (fop, fentrylk); +                break; + +        case GF_FOP_LOOKUP: +                AHA_UNWIND_FOP (fop, lookup); +                break; + +        case GF_FOP_READDIR: +                AHA_UNWIND_FOP (fop, readdir); +                break; + +        case GF_FOP_READDIRP: +                AHA_UNWIND_FOP (fop, readdirp); +                break; + +        case GF_FOP_XATTROP: +                AHA_UNWIND_FOP (fop, xattrop); +                break; + +        case GF_FOP_FXATTROP: +                AHA_UNWIND_FOP (fop, fxattrop); +                break; + +        case GF_FOP_SETATTR: +                AHA_UNWIND_FOP (fop, setattr); +                break; + +        case GF_FOP_FSETATTR: +                AHA_UNWIND_FOP (fop, fsetattr); +                break; + +        default: +                /* Some fops are not implemented yet, +                 * and this would never happen cause we wouldn't +                 * queue 
them (see the assert statement in aha_retry_fop()) +                 */ +                break; +        } +} diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h new file mode 100644 index 00000000000..5c8f56bca97 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.h @@ -0,0 +1,12 @@ +#ifndef _AHA_RETRY_H +#define _AHA_RETRY_H + +void aha_retry_failed_fops (struct aha_conf *conf); + +void aha_retry_fop (struct aha_fop *fop); + +void aha_force_unwind_fops (struct aha_conf *conf); + +void aha_force_unwind_fop (struct aha_fop *fop); + +#endif /* _AHA_RETRY_H */ diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c new file mode 100644 index 00000000000..2135e47f37f --- /dev/null +++ b/xlators/cluster/aha/src/aha.c @@ -0,0 +1,345 @@ +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" +#include "aha.h" + +#include "syncop.h" + + +int +retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg) +{ +        /* Nothing to do here ... 
*/ +        return 0; +} + +int +retry_failed_fops (void *arg) +{ +        xlator_t *this = NULL; + +        struct aha_conf *conf = NULL; + +        this = arg; +        conf = this->private; + +        aha_retry_failed_fops (conf); + +        return 0; +} + +void +dispatch_fop_queue_drain (xlator_t *this) +{ +        struct syncenv *env = NULL; +        int ret = 0; + +        env = this->ctx->env; + +        ret = synctask_new (env, retry_failed_fops, +                                retry_failed_fops_cbk, NULL, this); +        if (ret != 0) { +                gf_log (GF_AHA, GF_LOG_CRITICAL, +                        "Failed to dispatch synctask " +                        "to drain fop queue!"); +        } +} + +inline void +__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired) +{ +        conf->timer_expired = expired; +} + +inline gf_boolean_t +__aha_is_timer_expired (struct aha_conf *conf) +{ +        return conf->timer_expired; +} + +gf_boolean_t +aha_is_timer_expired (struct aha_conf *conf) +{ +        gf_boolean_t expired = _gf_false; + +        LOCK (&conf->lock); +        { +                expired = __aha_is_timer_expired (conf); +        } +        UNLOCK (&conf->lock); + +        return expired; +} + +void +aha_child_down_timer_expired (void *data) +{ +        struct aha_conf *conf = NULL; + +        conf = data; + +        gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!"); + +        LOCK (&conf->lock); +        { +                __aha_set_timer_status (conf, _gf_true); +        } +        UNLOCK (&conf->lock); + +        aha_force_unwind_fops ((struct aha_conf *)data); +} + +void +__aha_start_timer (struct aha_conf *conf) +{ +        struct timespec child_down_timeout = { +                .tv_sec = conf->server_wait_timeout, +                .tv_nsec = 0 +        }; + +        __aha_set_timer_status (conf, _gf_false); + +        conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout, +                                     
      aha_child_down_timer_expired, conf); +        if (!conf->timer) { +                gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!"); +        } + +        gf_log (GF_AHA, GF_LOG_INFO, +                "Registered timer for %lu seconds.", +                conf->server_wait_timeout); +} + +void +__aha_cancel_timer (struct aha_conf *conf) +{ +        if (!conf->timer) +                goto out; + +        gf_timer_call_cancel (conf->this->ctx, conf->timer); +        conf->timer = NULL; +        gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!"); +out: +        return; +} + +void +__aha_update_child_status (struct aha_conf *conf, int status) +{ +        conf->child_up = status; +} + +void +aha_handle_child_up (xlator_t *this) +{ +        struct aha_conf *conf = this->private; + +        LOCK (&conf->lock); +        { +                __aha_update_child_status ( +                    conf, AHA_CHILD_STATUS_UP);  /* Mark the child as up */ +                __aha_set_timer_status ( +                    conf, _gf_false);       /* Timer is no longer expired */ +                __aha_cancel_timer (conf);      /* Cancel the timer */ +        } +        UNLOCK (&conf->lock); +} + +void +aha_handle_child_down (xlator_t *this) +{ +        struct aha_conf *conf = this->private; + +        LOCK (&conf->lock); +        { +                __aha_update_child_status (conf, AHA_CHILD_STATUS_DOWN); +                __aha_set_timer_status (conf, _gf_true); +                __aha_start_timer (conf); +        } +        UNLOCK (&conf->lock); +} + +int32_t +notify (xlator_t *this, int32_t event, void *data, ...) 
+{ +        switch (event) { +        case GF_EVENT_CHILD_DOWN: +                gf_log (this->name, GF_LOG_WARNING, "Got child-down event!"); +                aha_handle_child_down (this); +                break; +        case GF_EVENT_CHILD_UP: +                gf_log (this->name, GF_LOG_WARNING, "Got child-up event!"); +                aha_handle_child_up (this); +                dispatch_fop_queue_drain (this); +                break; +        default: +                break; +        } + +        default_notify (this, event, data); + +        return 0; +} + +int32_t +aha_priv_dump (xlator_t *this) +{ +        return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ +        int ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_aha_mt_end + 1); + +        if (ret != 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Memory accounting init failed!"); +                return ret; +        } + +        return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ +        struct aha_conf *conf = NULL; + +        conf = this->private; + +        GF_OPTION_RECONF ("server-wait-timeout-seconds", +                                conf->server_wait_timeout, +                                options, size_uint64, err); + +        return 0; +err: +        return -1; +} + +int +aha_init_options (xlator_t *this) +{ +        struct aha_conf *conf = NULL; + +        conf = this->private; + +        GF_OPTION_INIT ("server-wait-timeout-seconds", +                        conf->server_wait_timeout, +                        size_uint64, err); + +        return 0; +err: +        return -1; +} + + +int +init (xlator_t *this) +{ +        int ret = 0; +        struct aha_conf *conf = NULL; + +        conf = aha_conf_new (); +        if (!conf) { +                ret = -(ENOMEM); +                goto err; +        } + +        conf->this = this; +        this->private = conf; + +      
  aha_init_options (this); + +        /* init() completed successfully */ +        goto done; +err: +        gf_log (GF_AHA, GF_LOG_ERROR, +                "init() failed, please see " +                "logs for details."); + +        /* Free all allocated memory */ +        aha_conf_destroy (conf); +done: +        return ret; +} + +void +fini (xlator_t *this) +{ +        struct aha_conf *conf = this->private; + +        aha_conf_destroy (conf); + +        this->private = NULL; +} + +struct xlator_dumpops dumpops = { +        .priv = aha_priv_dump, +}; + +struct xlator_fops cbks; + +struct xlator_fops fops = { +    .lookup      = aha_lookup, +    .stat        = aha_stat, +    .readlink    = aha_readlink, +    .mknod       = aha_mknod, +    .mkdir       = aha_mkdir, +    .unlink      = aha_unlink, +    .rmdir       = aha_rmdir, +    .symlink     = aha_symlink, +    .rename      = aha_rename, +    .link        = aha_link, +    .truncate    = aha_truncate, +    .create      = aha_create, +    .open        = aha_open, +    .readv       = aha_readv, +    .writev      = aha_writev, +    .statfs      = aha_statfs, +    .flush       = aha_flush, +    .fsync       = aha_fsync, +    .setxattr    = aha_setxattr, +    .getxattr    = aha_getxattr, +    .removexattr = aha_removexattr, +    .fsetxattr    = aha_fsetxattr, +    .fgetxattr    = aha_fgetxattr, +    .fremovexattr = aha_fremovexattr, +    .opendir     = aha_opendir, +    .readdir     = aha_readdir, +    .readdirp    = aha_readdirp, +    .fsyncdir    = aha_fsyncdir, +    .access      = aha_access, +    .ftruncate   = aha_ftruncate, +    .fstat       = aha_fstat, +    .lk          = aha_lk, +    .lookup_cbk  = aha_lookup_cbk, +    .xattrop     = aha_xattrop, +    .fxattrop    = aha_fxattrop, +    .inodelk     = aha_inodelk, +    .finodelk    = aha_finodelk, +    .entrylk     = aha_entrylk, +    .fentrylk    = aha_fentrylk, +    .setattr     = aha_setattr, +    .fsetattr    = aha_fsetattr, +}; + +struct volume_options 
options[] = { +        { .key = {"server-wait-timeout-seconds"}, +          .type = GF_OPTION_TYPE_SIZET, +          .min = 10, +          .max = 20 * 60, +          .default_value = TOSTRING (120), +          .description = "Specifies the number of seconds the " +                         "AHA translator will wait " +                         "for a CHILD_UP event before " +                         "force-unwinding the frames it has " +                         "currently stored for retry." +        }, +        { .key  = {NULL} } +}; diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h new file mode 100644 index 00000000000..3dbf3199776 --- /dev/null +++ b/xlators/cluster/aha/src/aha.h @@ -0,0 +1,46 @@ +#ifndef _AHA_H +#define _AHA_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "statedump.h" +#include "call-stub.h" +#include "defaults.h" +#include "list.h" +#include "timer.h" + +#include "aha-mem-types.h" + +/* new() and destroy() functions for all structs can be found in + * aha-helpers.c + */ +struct aha_conf { +        xlator_t *this; +        uint8_t child_up; +        gf_lock_t lock; +        struct list_head failed; +        gf_timer_t *timer; +        gf_boolean_t timer_expired; +        uint64_t server_wait_timeout; +}; + +struct aha_fop { +        call_stub_t *stub;      /* Only used to store function arguments */ +        call_frame_t *frame;    /* Frame corresponding to this fop */ +        uint64_t tries; +        struct list_head list; +}; + +enum { +        AHA_CHILD_STATUS_DOWN = 0, +        AHA_CHILD_STATUS_UP = 1, +        AHA_CHILD_STATUS_MAX +}; + +gf_boolean_t aha_is_timer_expired (struct aha_conf *conf); + +#endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index a9714b02b79..a97d03bb055 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -5559,6 +5559,7 @@ 
dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,  {          dht_local_t     *local          = NULL;          xlator_t        *avail_subvol   = NULL; +        int             op_errno        = 0;          local = frame->local; @@ -5571,9 +5572,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,                                     subvol, subvol->fops->mknod, loc, mode,                                     rdev, umask, params);          } else { -                avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - -                if (avail_subvol != subvol) { +                /* This will return NULL if all subvolumes are full +                 * and/or no subvolume needs the min_free_disk limit +                 */ +                avail_subvol = dht_free_disk_available_subvol (this, subvol, +                                                                local); +                if (!avail_subvol) { +                        op_errno = ENOSPC; +                        goto err; +                } else if (avail_subvol != subvol) {                          local->params = dict_ref (params);                          local->rdev = rdev;                          local->mode = mode; @@ -5603,6 +5610,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,          }  out:          return 0; +err: +        return op_errno;  }  int32_t @@ -6242,8 +6251,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this,              }          } -        dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode, -                                        umask, params); +        op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, +                                                        rdev, mode, umask, +                                                        params); +        if (op_errno != 0) { +                goto err; +        }  done:          return 0; @@ -6738,6 +6751,7 @@ 
dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,  {          dht_local_t     *local          = NULL;          xlator_t        *avail_subvol   = NULL; +        int             op_errno        = 0;          local = frame->local; @@ -6752,8 +6766,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,          } else {                  avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - -                if (avail_subvol != subvol) { +                if (!avail_subvol) { +                        op_errno = ENOSPC; +                        goto err; +                } else if (avail_subvol != subvol) {                          local->params = dict_ref (params);                          local->flags = flags;                          local->mode = mode; @@ -6780,6 +6796,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,          }  out:          return 0; +err: +        DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, +                          NULL, NULL, NULL); +        return op_errno;  }  int @@ -6882,9 +6902,10 @@ dht_create_do (call_frame_t *frame)                  goto err;          } -        dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc, -                                         local->flags, local->mode, -                                         local->umask, local->fd, local->params); +        dht_create_wind_to_avail_subvol (frame, this, subvol, +                                                &local->loc, local->flags, +                                                local->mode, local->umask, +                                                local->fd, local->params);          return 0;  err:          local->refresh_layout_unlock (frame, this, -1, 1); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9e9ca712417..613a9d39816 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ 
b/xlators/cluster/dht/src/dht-common.h @@ -300,6 +300,7 @@ struct dht_du {          uint64_t avail_space;          uint32_t log;          uint32_t chunks; +        gf_boolean_t is_full;  };  typedef struct dht_du dht_du_t; @@ -484,6 +485,7 @@ struct dht_conf {          dht_du_t      *du_stats;          double         min_free_disk;          double         min_free_inodes; +        gf_boolean_t   min_free_strict_mode;          char           disk_unit;          int32_t        refresh_interval;          gf_boolean_t   unhashed_sticky_bit; @@ -549,6 +551,10 @@ struct dht_conf {          gf_boolean_t    lock_migration_enabled;          gf_lock_t       lock; + +        /* du stats */ +        uint32_t       du_refresh_interval_sec; +        gf_lock_t      du_refresh_lock;  };  typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 1eb9e63c531..1b20dabc61f 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)  	call_frame_t  *statfs_frame = NULL;  	dht_local_t   *statfs_local = NULL;  	struct timeval tv           = {0,}; +	struct timeval cmp_tv       = {0,};          loc_t          tmp_loc      = {0,};  	conf  = this->private; +        /* Somebody else is already refreshing the statfs info */ +        if (TRY_LOCK (&conf->du_refresh_lock) != 0) +                return 0; +  	gettimeofday (&tv, NULL); +        cmp_tv = conf->last_stat_fetch; +        cmp_tv.tv_sec += conf->du_refresh_interval_sec; +          /* make it root gfid, should be enough to get the proper             info back */          tmp_loc.gfid[15] = 1; -	if (tv.tv_sec > (conf->refresh_interval -			 + conf->last_stat_fetch.tv_sec)) { - +	if (timercmp (&tv, &cmp_tv, >)) {  		statfs_frame = copy_frame (frame);  		if (!statfs_frame) {  			goto err; @@ -200,14 +206,18 @@ dht_get_du_info 
(call_frame_t *frame, xlator_t *this, loc_t *loc)  				    &tmp_loc, statfs_local->params);  		} -		conf->last_stat_fetch.tv_sec = tv.tv_sec; +		conf->last_stat_fetch = tv;  	} -	return 0; +        ret = 0; +        goto out;  err:  	if (statfs_frame)  		DHT_STACK_DESTROY (statfs_frame); -	return -1; +        ret = -1; +out: +        UNLOCK (&conf->du_refresh_lock); +        return ret;  } @@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)  	conf = this->private;  	/* Check for values above specified percent or free disk */ -	LOCK (&conf->subvolume_lock); -	{ +	if (TRY_LOCK (&conf->subvolume_lock) != 0) { +		for (i = 0; i < conf->subvolume_cnt; i++) { +			if (subvol == conf->subvolumes[i]) { +                                return conf->du_stats[i].is_full; +                        } +                } +        } else {  		for (i = 0; i < conf->subvolume_cnt; i++) {  			if (subvol == conf->subvolumes[i]) {  				if (conf->disk_unit == 'p') { @@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)  				}  			}  		} -	} + +	        /* i will be less than subvolume_cnt if either of +                 * these booleans are true */ +                is_subvol_filled = ( +                    subvol_filled_space || subvol_filled_inodes); +                if (is_subvol_filled) { +                        conf->du_stats[i].is_full = is_subvol_filled; +                } +        }  	UNLOCK (&conf->subvolume_lock);  	if (subvol_filled_space && conf->subvolume_status[i]) { @@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)  		}  	} -	is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); -  	return is_subvol_filled;  } @@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,          LOCK (&conf->subvolume_lock);  	{ -                avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, +                avail_subvol = dht_subvol_maxspace_nonzeroinode(this, 
subvol,                                                                   layout); -                if(!avail_subvol) -                { -                        avail_subvol = dht_subvol_maxspace_nonzeroinode(this, -                                                                        subvol, -                                                                        layout); -                } -  	}  	UNLOCK (&conf->subvolume_lock);  out: @@ -325,7 +339,6 @@ out:  		gf_msg_debug (this->name, 0,  		              "No subvolume has enough free space \                                and/or inodes to create"); -                avail_subvol = subvol;  	}          if (layout) diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 8abf0d59b88..ac0f0e186fa 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this,          xlator_t     *subvol = NULL;          int           op_errno = -1;          dht_local_t  *local = NULL; +        dht_conf_t      *conf = NULL;          VALIDATE_OR_GOTO (frame, err);          VALIDATE_OR_GOTO (this, err);          VALIDATE_OR_GOTO (fd, err); +        conf = this->private; + +        if (conf->min_free_strict_mode == _gf_true) +                dht_get_du_info (frame, this, loc);          local = dht_local_init (frame, loc, fd, GF_FOP_OPEN);          if (!local) { @@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this,                                "no cached subvolume for fd=%p", fd);                  op_errno = EINVAL;                  goto err; +        } else if (conf->min_free_strict_mode == _gf_true && +                   dht_is_subvol_filled (this, subvol) == _gf_true && +                   flags & O_APPEND) { +                op_errno = ENOSPC; +                goto err;          }          local->rebalance.flags = flags; diff --git 
a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 112685b659e..7420461da76 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          xlator_t     *subvol = NULL;          int           op_errno = -1;          dht_local_t  *local = NULL; +        loc_t        *nil_loc = {0,}; +        dht_conf_t   *conf = NULL;          VALIDATE_OR_GOTO (frame, err);          VALIDATE_OR_GOTO (this, err);          VALIDATE_OR_GOTO (fd, err); +        conf = this->private; + +          local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE);          if (!local) { @@ -173,15 +178,21 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,                  goto err;          } +        if (conf->min_free_strict_mode == _gf_true) +                dht_get_du_info (frame, this, nil_loc); +          subvol = local->cached_subvol;          if (!subvol) {                  gf_msg_debug (this->name, 0,                                "no cached subvolume for fd=%p", fd);                  op_errno = EINVAL;                  goto err; +        } else if (conf->min_free_strict_mode == _gf_true && +                   dht_is_subvol_filled (this, subvol) == _gf_true) { +                op_errno = ENOSPC; +                goto err;          } -          local->rebalance.vector = iov_dup (vector, count);          local->rebalance.offset = off;          local->rebalance.count = count; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 127996ecf61..ebc8a9c2492 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -20,7 +20,7 @@  #define GF_DISK_SECTOR_SIZE             512  #define DHT_REBALANCE_PID               4242 /* Change it if required */ -#define DHT_REBALANCE_BLKSIZE           (128 * 1024) +#define DHT_REBALANCE_BLKSIZE 
          (1024 * 1024)  /* 1 MB */  #define MAX_MIGRATE_QUEUE_COUNT         500  #define MIN_MIGRATE_QUEUE_COUNT         200 diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 5c810f0dc77..ccbf66b626d 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -443,6 +443,8 @@ dht_reconfigure (xlator_t *this, dict_t *options)          conf->disk_unit = 0;          if (conf->min_free_disk < 100.0)                  conf->disk_unit = 'p'; +	GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode, +                          options, bool, out);  	GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,                            percent, out); @@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options,                            bool, out); + +        GF_OPTION_RECONF ("du-refresh-interval-sec", +                           conf->du_refresh_interval_sec, options, uint32, out);          ret = 0;  out:          return ret; @@ -720,7 +725,10 @@ dht_init (xlator_t *this)          GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);  	GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, -			err); +                        err); + +	GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode, +                        bool, err);          GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,                          err); @@ -738,6 +746,11 @@ dht_init (xlator_t *this)          GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled,                           bool, err); +        GF_OPTION_INIT ("du-refresh-interval-sec", +                        conf->du_refresh_interval_sec, uint32, err); + +        LOCK_INIT (&conf->du_refresh_lock); +          if (defrag) {                defrag->lock_migration_enabled = conf->lock_migration_enabled; 
@@ -907,6 +920,14 @@ struct volume_options options[] = {            "process starts balancing out the cluster, and logs will appear "            "in log files",          }, +        { .key  = {"min-free-strict-mode"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "When enabled, will reject in-flight writes or " +          "append operations to files when the target subvolume falls " +          "below min-free-(disk|inodes).  When disabled, these are allowed  " +          "through and only new files will be affected.", +        },  	{ .key  = {"min-free-inodes"},            .type = GF_OPTION_TYPE_PERCENT,            .default_value = "5%", @@ -1089,5 +1110,14 @@ struct volume_options options[] = {                           " associated with a file during rebalance"          }, +        { .key  = {"du-refresh-interval-sec"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = 0, +          .default_value = "60", +          .validate = GF_OPT_VALIDATE_MIN, +          .description = "Specifies how many seconds before subvolume statfs " +                         "info is re-validated." 
+        }, +          { .key  = {NULL} },  }; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 56e17d6e884..996faffa37f 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this,                                                          local);          } -        if (subvol != avail_subvol) { +        if (!avail_subvol) { +                op_errno = ENOSPC; +                goto err; +        } else if (subvol != avail_subvol) {                  /* create a link file instead of actual file */                  local->params = dict_ref (params);                  local->mode = mode; @@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,                                                          local);          } -        if (avail_subvol != subvol) { +        if (!avail_subvol) { +                op_errno = ENOSPC; +                goto err; +        } else if (avail_subvol != subvol) {                  /* Create linkfile first */                  local->params = dict_ref (params); diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index f1e9a399442..8b14ac99b8f 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this,                                                          local);          } -        if (subvol != avail_subvol) { +        if (!avail_subvol) { +                op_errno = ENOSPC; +                goto err; +        } else if (subvol != avail_subvol) {                  /* create a link file instead of actual file */                  local->mode = mode;                  local->flags = flags; @@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,                                                          local);          } -        if (avail_subvol 
!= subvol) { +        if (!avail_subvol) { +                op_errno = ENOSPC; +                goto err; +        } else if (avail_subvol != subvol) {                  /* Create linkfile first */                  local->params = dict_ref (params); diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index c21417a0192..69f182c5194 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -91,9 +91,13 @@ typedef struct _ios_sample_t {          uid_t  uid;          gid_t  gid;          char   identifier[UNIX_PATH_MAX]; +        char   path[UNIX_PATH_MAX];          glusterfs_fop_t fop_type;          struct timeval timestamp;          double elapsed; +        gf_boolean_t have_path; +        int32_t op_ret; +        int32_t op_errno;  } ios_sample_t; @@ -178,10 +182,33 @@ typedef int (*block_dump_func) (xlator_t *, struct ios_dump_args*,                                      int , int , uint64_t ) ;  struct ios_local { -        struct timeval  wind_at; -        struct timeval  unwind_at; +        inode_t *inode; +        loc_t loc; +        fd_t *fd;  }; +static struct ios_local * +ios_local_new() { +        return GF_CALLOC (1, sizeof (struct ios_local), +                                gf_common_mt_char); +} + +static void +ios_local_free (struct ios_local *local) +{ +        if (!local) +                return; + +        inode_unref (local->inode); + +        if (local->fd) +                fd_unref (local->fd); + +        loc_wipe (&local->loc); +        memset (local, 0, sizeof (*local)); +        GF_FREE (local); +} +  struct volume_options options[];  static int @@ -192,6 +219,57 @@ is_fop_latency_started (call_frame_t *frame)          return memcmp (&frame->begin, &epoch, sizeof (epoch));  } +static void +ios_free_local (call_frame_t *frame) +{ +        struct ios_local *local = frame->local; + +        ios_local_free (local); + +        frame->local = NULL; +} + +static void 
+ios_track_loc (call_frame_t *frame, loc_t *loc) +{ +        struct ios_local *local = NULL; + +        if (loc && loc->path) { +                /* Check if frame->local is already set (it should +                 * only be set by either ios_track_loc() or +                 * ios_track_fd()). In other words, this check +                 * allows us to chain calls to ios_track_loc() +                 * and ios_track_fd() without clobbering frame->local +                 * in the process. +                 */ +                if (frame->local) { +                        local = frame->local; +                } else { +                        local = ios_local_new (); +                } +                loc_copy (&local->loc, loc); +                frame->local = local; +        } +} + +static void +ios_track_fd (call_frame_t *frame, fd_t *fd) +{ +        struct ios_local *local = NULL; + +        if (fd && fd->inode) { +                if (frame->local) { +                        local = frame->local; +                } else { +                        local = ios_local_new (); +                } +                local->fd = fd_ref (fd); +                local->inode = inode_ref (fd->inode); +                frame->local = local; +        } +} + +  #define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"  #ifdef GF_LINUX_HOST_OS  #define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats" @@ -206,7 +284,7 @@ is_fop_latency_started (call_frame_t *frame)                  conf = this->private;                                   \                  if (conf && conf->measure_latency) {                    \                          gettimeofday (&frame->end, NULL);               \ -                        update_ios_latency (conf, frame, GF_FOP_##op);  \ +                        update_ios_latency (conf, frame, GF_FOP_##op, 0, 0);  \                  }                                                       \          } while (0) @@ -244,7 +322,7 @@ is_fop_latency_started 
(call_frame_t *frame)  #define STATS_ADD(x,i)  (x) += (i)  #endif -#define UPDATE_PROFILE_STATS(frame, op)                                       \ +#define UPDATE_PROFILE_STATS(frame, op, op_ret, op_errno)                     \          do {                                                                  \                  struct ios_conf  *conf = NULL;                                \                                                                                \ @@ -257,7 +335,8 @@ is_fop_latency_started (call_frame_t *frame)                              conf->count_fop_hits) {                           \                                  BUMP_FOP(op);                                 \                                  gettimeofday (&frame->end, NULL);             \ -                                update_ios_latency (conf, frame, GF_FOP_##op);\ +                                update_ios_latency (conf, frame, GF_FOP_##op, \ +                                                        op_ret, op_errno);    \                          }                                                     \                  }                                                             \                  STATS_UNLOCK (&conf->lock);                                   \ @@ -694,7 +773,7 @@ ios_dump_throughput_stats (struct ios_stat_head *list_head, xlator_t *this,  int  _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) { -        char                  *key_root = "gluster"; +        char                  *key_root = "storage.gluster";          char                  *xlator_name = NULL;          char                  *instance_name = NULL;          size_t                key_len = 0; @@ -719,7 +798,7 @@ _io_stats_get_key_prefix (xlator_t *this, char **key_prefix) {          }          if (strcmp (__progname, "glusterfsd") == 0) -                key_root = "gluster.brick"; +                key_root = "storage.gluster.brick";          if (instance_name) {                  /* +3 for 2 
x "." + NULL */ @@ -1010,7 +1089,10 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,          char   *port_pos = NULL;          char   *group_name = NULL;          char   *username = NULL; +        char   *path = NULL;          struct ios_conf *conf = NULL; +        const char *error_string = NULL; +        int32_t op_errno = 0;          conf = this->private; @@ -1057,12 +1139,22 @@ _io_stats_write_latency_sample (xlator_t *this, ios_sample_t *sample,                  sprintf (group_name, "%d", (int32_t)sample->gid);          } +        path = "Unknown"; +        if (sample->have_path) +                path = sample->path; + +        error_string = "No Error"; +        if (sample->op_ret != 0) { +                op_errno = abs (sample->op_errno); +                error_string = strerror (op_errno); +        } +          ios_log (this, logfp, -                 "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s", +                 "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s,%s,%d,%s",                   epoch_time, fop_enum_to_pri_string (sample->fop_type),                   fop_enum_to_string (sample->fop_type),                   sample->elapsed, xlator_name, instance_name, username, -                 group_name, hostname, port); +                 group_name, hostname, port, path, op_errno, error_string);          goto out;  err:          gf_log (this->name, GF_LOG_ERROR, @@ -1608,14 +1700,87 @@ io_stats_dump_fd (xlator_t *this, struct ios_fd *iosfd)          return 0;  } +void ios_local_get_inode (struct ios_local *local, inode_t **inode) +{ +        if (!local) +                return; + +        /* In the cases that a loc is given to us, +         * we should use that as the source of truth +         * for the inode. 
+         */ +        if (local->loc.inode) { +                *inode = local->loc.inode; +                return; +        } + +        /* Fall back to the inode in the local struct, +         * but there is no guarantee this will be a valid +         * pointer. +         */ +        *inode = local->inode; +} + +void ios_local_get_path (call_frame_t *frame, const char **path) +{ +        struct ios_stat  *iosstat  = NULL; +        struct ios_local *local    = NULL; +        inode_t          *inode    = NULL; + +        local = frame->local; +        if (!local) +                goto out; + +        ios_local_get_inode (local, &inode); + +        if (inode) { +                /* Each inode shold have an iosstat struct attached to it. +                 * This is the preferred way to retrieve the path. +                 */ +                ios_inode_ctx_get (inode, frame->this, &iosstat); +                if (iosstat) { +                        gf_log ("io-stats", GF_LOG_DEBUG, +                                "[%s] Getting path from iostat struct", +                                fop_enum_to_string (frame->op)); +                        *path = iosstat->filename; +                        goto out; +                } +        } + +        /* If we don't have the iosstat attached to the inode, +         * fall back to retrieving the path via the loc struct +         * inside the local. +         */ +        if (local->loc.path) { +                gf_log ("io-stats", GF_LOG_DEBUG, +                        "[%s] Getting path from loc_t", +                        fop_enum_to_string (frame->op)); +                *path = local->loc.path; +                goto out; +        } + +out: +        /* If the inode and the loc don't have the path, we're out of luck. 
+         */ +        if (!*path) { +                gf_log ("io-stats", GF_LOG_DEBUG, +                        "Unable to get path for fop: %s", +                        fop_enum_to_string (frame->op)); +        } + +        return; +} +  void collect_ios_latency_sample (struct ios_conf *conf,                  glusterfs_fop_t fop_type, double elapsed, -                call_frame_t *frame) +                call_frame_t *frame, int32_t op_ret, int32_t op_errno)  { +        struct ios_local *ios_local      = NULL;          ios_sample_buf_t *ios_sample_buf = NULL;          ios_sample_t     *ios_sample = NULL;          struct timeval   *timestamp = NULL;          call_stack_t     *root = NULL; +        const char       *path = NULL;          ios_sample_buf = conf->ios_sample_buf; @@ -1630,6 +1795,8 @@ void collect_ios_latency_sample (struct ios_conf *conf,          ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);          ios_sample->elapsed = elapsed;          ios_sample->fop_type = fop_type; +        ios_sample->op_ret = op_ret; +        ios_sample->op_errno = op_errno;          ios_sample->uid = root->uid;          ios_sample->gid = root->gid;          (ios_sample->timestamp).tv_sec = timestamp->tv_sec; @@ -1637,6 +1804,52 @@ void collect_ios_latency_sample (struct ios_conf *conf,          memcpy (&ios_sample->identifier, &root->identifier,                  sizeof (root->identifier)); +        /* Eventually every FOP will be supported +         * (i.e., the frame->local will be +         * of type struct ios_local), but for now, this is a safety. 
+         */ +        switch (ios_sample->fop_type) { + +        case GF_FOP_CREATE: +        case GF_FOP_OPEN: +        case GF_FOP_STAT: +        case GF_FOP_FSTAT: +        case GF_FOP_READ: +        case GF_FOP_WRITE: +        case GF_FOP_OPENDIR: +        case GF_FOP_READDIRP: +        case GF_FOP_READDIR: +        case GF_FOP_FLUSH: +        case GF_FOP_ACCESS: +        case GF_FOP_UNLINK: +        case GF_FOP_TRUNCATE: +        case GF_FOP_MKDIR: +        case GF_FOP_RMDIR: +        case GF_FOP_SETATTR: +        case GF_FOP_LOOKUP: +        case GF_FOP_INODELK: +        case GF_FOP_FINODELK: +        case GF_FOP_ENTRYLK: +        case GF_FOP_FXATTROP: +        case GF_FOP_XATTROP: +        case GF_FOP_GETXATTR: +        case GF_FOP_FGETXATTR: +        case GF_FOP_SETXATTR: +        case GF_FOP_FSETXATTR: +        case GF_FOP_STATFS: +        case GF_FOP_FSYNC: +                ios_local_get_path (frame, &path); +                break; +        default: +                path = NULL; +                break; +        } + +        if (path) { +                strncpy (ios_sample->path, path, sizeof (ios_sample->path)); +                ios_sample->have_path = _gf_true; +        } +          /* We've reached the end of the circular buffer, start from the           * beginning. 
*/          if (ios_sample_buf->pos == (ios_sample_buf->size - 1)) @@ -1674,7 +1887,7 @@ update_ios_latency_stats (struct ios_global_stats   *stats, double elapsed,  int  update_ios_latency (struct ios_conf *conf, call_frame_t *frame, -                    glusterfs_fop_t op) +                    glusterfs_fop_t op, int32_t op_ret, int32_t op_errno)  {          double elapsed;          struct timeval *begin, *end; @@ -1687,7 +1900,7 @@ update_ios_latency (struct ios_conf *conf, call_frame_t *frame,          update_ios_latency_stats (&conf->cumulative, elapsed, op);          update_ios_latency_stats (&conf->incremental, elapsed, op); -        collect_ios_latency_sample (conf, op, elapsed, frame); +        collect_ios_latency_sample (conf, op, elapsed, frame, op_ret, op_errno);          return 0;  } @@ -1811,40 +2024,100 @@ unlock_list_head:          return ret;  } +static int +attach_iosstat_to_inode (xlator_t *this, inode_t *inode, const char *path, +                                const uuid_t gfid) { +        struct   ios_stat *iosstat = NULL; + +        if (!inode) { +                return -EINVAL; +        } + +        ios_inode_ctx_get (inode, this, &iosstat); +        if (!iosstat) { +                iosstat = GF_CALLOC (1, sizeof (*iosstat), +                                        gf_io_stats_mt_ios_stat); +                if (!iosstat) { +                        return -ENOMEM; +                } +                iosstat->filename = gf_strdup (path); +                gf_uuid_copy (iosstat->gfid, gfid); +                LOCK_INIT (&iosstat->lock); +                ios_inode_ctx_set (inode, this, iosstat); +        } + +        return 0; +} + + +int +ios_build_fd (xlator_t *this, const char *path, fd_t *fd, struct ios_fd **iosfd) +{ +        struct ios_fd *ifd = NULL; +        int            ret = 0; + +        ifd = GF_CALLOC (1, sizeof (*ifd), gf_io_stats_mt_ios_fd); +        if (!ifd) { +                ret = -ENOMEM; +                goto free_and_out; + 
       } + +        if (path) { +                ifd->filename = gf_strdup (path); +                if (!ifd->filename) { +                        ret = -ENOMEM; +                        goto free_and_out; +                } +        } + +        gettimeofday (&ifd->opened_at, NULL); + +        if (fd) +                ios_fd_ctx_set (fd, this, ifd); + +        *iosfd = ifd; + +        return ret; + +        /* Failure path */ +free_and_out: +        if (ifd) { +                GF_FREE (ifd->filename); +                GF_FREE (ifd); +        } + +        *iosfd = NULL; + +        return ret; +} + +  int  io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, fd_t *fd,                       inode_t *inode, struct iatt *buf,                       struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        struct ios_fd *iosfd = NULL; -        char          *path = NULL; -        struct ios_stat *iosstat = NULL; -        struct ios_conf   *conf = NULL; - -        conf = this->private; +        struct ios_local *local = NULL; +        struct ios_conf  *conf  = NULL; +        struct ios_fd    *iosfd = NULL; -        path = frame->local; -        frame->local = NULL; - -        if (!path) +        if (op_ret < 0) {                  goto unwind; +        } -        if (op_ret < 0) { -                GF_FREE (path); +        local = frame->local; +        if (!local) {                  goto unwind;          } -        iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); +        conf = this->private; + +        ios_build_fd (this, local->loc.path, fd, &iosfd);          if (!iosfd) { -                GF_FREE (path);                  goto unwind;          } -        iosfd->filename = path; -        gettimeofday (&iosfd->opened_at, NULL); - -        ios_fd_ctx_set (fd, this, iosfd);          LOCK (&conf->lock);          {                  conf->cumulative.nr_opens++; @@ 
-1855,18 +2128,12 @@ io_stats_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          }          UNLOCK (&conf->lock); -        iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); -        if (!iosstat) { -                GF_FREE (path); -                goto unwind; -        } -        iosstat->filename = gf_strdup (path); -        gf_uuid_copy (iosstat->gfid, buf->ia_gfid); -        LOCK_INIT (&iosstat->lock); -        ios_inode_ctx_set (fd->inode, this, iosstat); +        attach_iosstat_to_inode (this, local->loc.inode, local->loc.path, +                                        buf->ia_gfid);  unwind: -        UPDATE_PROFILE_STATS (frame, CREATE); +        UPDATE_PROFILE_STATS (frame, CREATE, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,                               preparent, postparent, xdata);          return 0; @@ -1877,44 +2144,24 @@ int  io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                     int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)  { -        struct ios_fd *iosfd = NULL; -        char          *path = NULL; -        struct   ios_stat *iosstat = NULL; -        struct ios_conf   *conf = NULL; - -        conf = this->private; -        path = frame->local; -        frame->local = NULL; - -        if (!path) -                goto unwind; +        struct ios_stat  *iosstat = NULL; +        struct ios_local *local   = NULL; +        struct ios_conf  *conf    = NULL; +        struct ios_fd    *iosfd   = NULL;          if (op_ret < 0) { -                GF_FREE (path);                  goto unwind;          } -        iosfd = GF_CALLOC (1, sizeof (*iosfd), gf_io_stats_mt_ios_fd); -        if (!iosfd) { -                GF_FREE (path); +        local = frame->local; +        if (!local) {                  goto unwind;          } -        iosfd->filename = path; -        gettimeofday 
(&iosfd->opened_at, NULL); - -        ios_fd_ctx_set (fd, this, iosfd); - -        ios_inode_ctx_get (fd->inode, this, &iosstat); -        if (!iosstat) { -                iosstat = GF_CALLOC (1, sizeof (*iosstat), -                                     gf_io_stats_mt_ios_stat); -                if (iosstat) { -                        iosstat->filename = gf_strdup (path); -                        gf_uuid_copy (iosstat->gfid, fd->inode->gfid); -                        LOCK_INIT (&iosstat->lock); -                        ios_inode_ctx_set (fd->inode, this, iosstat); -                } +        conf = this->private; +        ios_build_fd (this, local->loc.path, fd, &iosfd); +        if (!iosfd) { +                goto unwind;          }          LOCK (&conf->lock); @@ -1926,13 +2173,19 @@ io_stats_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  }          }          UNLOCK (&conf->lock); + +        ios_inode_ctx_get (fd->inode, this, &iosstat);          if (iosstat) {                BUMP_STATS (iosstat, IOS_STATS_TYPE_OPEN); -              iosstat = NULL;          } -unwind: -        UPDATE_PROFILE_STATS (frame, OPEN); +        attach_iosstat_to_inode (this, local->loc.inode, +                                        local->loc.path, +                                        local->loc.inode->gfid); + +unwind: +        UPDATE_PROFILE_STATS (frame, OPEN, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);          return 0; @@ -1943,7 +2196,8 @@ int  io_stats_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                     int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, STAT); +        UPDATE_PROFILE_STATS (frame, STAT, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);          return 0;  } @@ -1956,26 +2210,29 @@ 
io_stats_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      struct iatt *buf, struct iobref *iobref, dict_t *xdata)  {          int              len = 0; -        fd_t            *fd = NULL;          struct ios_stat *iosstat = NULL; +        struct ios_local *local = NULL; -        fd = frame->local; -        frame->local = NULL; +        local = frame->local; +        if (!local || !local->fd) +                goto unwind;          if (op_ret > 0) {                  len = iov_length (vector, count); -                BUMP_READ (fd, len); +                BUMP_READ (local->fd, len);          } -        UPDATE_PROFILE_STATS (frame, READ); -        ios_inode_ctx_get (fd->inode, this, &iosstat); +        UPDATE_PROFILE_STATS (frame, READ, op_ret, op_errno); +        ios_inode_ctx_get (local->fd->inode, this, &iosstat);          if (iosstat) { -              BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); -              BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); -              iosstat = NULL; +                BUMP_STATS (iosstat, IOS_STATS_TYPE_READ); +                BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_READ); +          } +unwind: +        ios_free_local (frame);          STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,                               vector, count, buf, iobref, xdata);          return 0; @@ -1989,21 +2246,23 @@ io_stats_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)  {          struct ios_stat *iosstat = NULL; +        struct ios_local *local = NULL;          inode_t         *inode   = NULL; -        UPDATE_PROFILE_STATS (frame, WRITE); -        if (frame->local){ -                inode = frame->local; -                frame->local = NULL; -                ios_inode_ctx_get (inode, this, &iosstat); -                if (iosstat) { -                        BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); -                     
   BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); -                        inode = NULL; -                        iosstat = NULL; -                } -        } +        local = frame->local; +        if (!local || !local->fd) +                goto unwind; + +        UPDATE_PROFILE_STATS (frame, WRITE, op_ret, op_errno); + +        ios_inode_ctx_get (local->inode, this, &iosstat); +        if (iosstat) { +                BUMP_STATS (iosstat, IOS_STATS_TYPE_WRITE); +                BUMP_THROUGHPUT (iosstat, IOS_STATS_THRU_WRITE); +        } +unwind: +        ios_free_local (frame);          STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);          return 0; @@ -2021,7 +2280,7 @@ io_stats_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          frame->local = NULL; -        UPDATE_PROFILE_STATS (frame, READDIRP); +        UPDATE_PROFILE_STATS (frame, READDIRP, op_ret, op_errno);          ios_inode_ctx_get (inode, this, &iosstat); @@ -2039,7 +2298,16 @@ int  io_stats_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, READDIR); +        struct ios_local *local  = NULL; +        struct ios_stat *iosstat = NULL; + +        local = frame->local; + +        UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno); + +        ios_free_local (frame); + +        UPDATE_PROFILE_STATS (frame, READDIR, op_ret, op_errno);          STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata);          return 0;  } @@ -2050,8 +2318,10 @@ io_stats_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      int32_t op_ret, int32_t op_errno,                      struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FSYNC); -        STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); +        
UPDATE_PROFILE_STATS (frame, FSYNC, op_ret, op_errno); +        ios_free_local (frame); +        STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, +                                xdata);          return 0;  } @@ -2061,7 +2331,8 @@ io_stats_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno,                        struct iatt *preop, struct iatt *postop, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, SETATTR); +        UPDATE_PROFILE_STATS (frame, SETATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop, xdata);          return 0;  } @@ -2072,7 +2343,8 @@ io_stats_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno,                       struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, UNLINK); +        UPDATE_PROFILE_STATS (frame, UNLINK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,                               preparent, postparent, xdata);          return 0; @@ -2086,7 +2358,7 @@ io_stats_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       struct iatt *preoldparent, struct iatt *postoldparent,                       struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, RENAME); +        UPDATE_PROFILE_STATS (frame, RENAME, op_ret, op_errno);          STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,                               preoldparent, postoldparent,                               prenewparent, postnewparent, xdata); @@ -2099,7 +2371,8 @@ io_stats_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, const char *buf,           
              struct iatt *sbuf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, READLINK); +        UPDATE_PROFILE_STATS (frame, READLINK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, sbuf, xdata);          return 0;  } @@ -2111,7 +2384,14 @@ io_stats_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       inode_t *inode, struct iatt *buf,                       dict_t *xdata, struct iatt *postparent)  { -        UPDATE_PROFILE_STATS (frame, LOOKUP); +        struct ios_local *local = frame->local; + +        if (local && local->loc.path && inode && op_ret >= 0) { +                attach_iosstat_to_inode (this, inode, local->loc.path, +                                                inode->gfid); +        } +        UPDATE_PROFILE_STATS (frame, LOOKUP, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,                               postparent);          return 0; @@ -2124,7 +2404,7 @@ io_stats_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        inode_t *inode, struct iatt *buf,                        struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, SYMLINK); +        UPDATE_PROFILE_STATS (frame, SYMLINK, op_ret, op_errno);          STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,                               preparent, postparent, xdata);          return 0; @@ -2137,7 +2417,7 @@ io_stats_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      inode_t *inode, struct iatt *buf,                      struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, MKNOD); +        UPDATE_PROFILE_STATS (frame, MKNOD, op_ret, op_errno);          STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, 
buf,                               preparent, postparent, xdata);          return 0; @@ -2151,28 +2431,16 @@ io_stats_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      struct iatt *preparent, struct iatt *postparent,                      dict_t *xdata)  { -        struct ios_stat *iosstat = NULL; -        char   *path = frame->local; +        struct ios_local *local = frame->local; -        if (!path) -                goto unwind; - -        UPDATE_PROFILE_STATS (frame, MKDIR); -        if (op_ret < 0) -                goto unwind; - -        iosstat = GF_CALLOC (1, sizeof (*iosstat), gf_io_stats_mt_ios_stat); -        if (iosstat) { -                LOCK_INIT (&iosstat->lock); -                iosstat->filename = gf_strdup(path); -                gf_uuid_copy (iosstat->gfid, buf->ia_gfid); -                ios_inode_ctx_set (inode, this, iosstat); +        if (local && local->loc.path) { +                local->inode = inode_ref (inode); +                attach_iosstat_to_inode (this, inode, local->loc.path, +                                                buf->ia_gfid);          } -unwind: -        /* local is assigned with path */ -        GF_FREE (frame->local); -        frame->local = NULL; +        UPDATE_PROFILE_STATS (frame, MKDIR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,                               preparent, postparent, xdata);          return 0; @@ -2185,7 +2453,7 @@ io_stats_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                     inode_t *inode, struct iatt *buf,                     struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, LINK); +        UPDATE_PROFILE_STATS (frame, LINK, op_ret, op_errno);          STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,                               preparent, postparent, xdata);          return 0; @@ 
-2196,7 +2464,8 @@ int  io_stats_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FLUSH); +        UPDATE_PROFILE_STATS (frame, FLUSH, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2206,20 +2475,28 @@ int  io_stats_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)  { -        struct ios_stat *iosstat = NULL; -        int              ret     = -1; +        struct ios_local *local   = NULL; +        struct ios_stat  *iosstat = NULL; +        int               ret     = -1; + +        local = frame->local; +        if (!local || !local->fd) +                goto unwind; -        UPDATE_PROFILE_STATS (frame, OPENDIR);          if (op_ret < 0)                  goto unwind; -        ios_fd_ctx_set (fd, this, 0); +        attach_iosstat_to_inode (this, local->inode, local->loc.path, +                                        local->inode->gfid); -        ret = ios_inode_ctx_get (fd->inode, this, &iosstat); -        if (!ret) +        ios_fd_ctx_set (local->fd, this, 0); +        ios_inode_ctx_get (local->fd->inode, this, &iosstat); +        if (iosstat)                  BUMP_STATS (iosstat, IOS_STATS_TYPE_OPENDIR);  unwind: +        UPDATE_PROFILE_STATS (frame, OPENDIR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);          return 0;  } @@ -2231,8 +2508,8 @@ io_stats_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      struct iatt *preparent, struct iatt *postparent, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, RMDIR); - +        UPDATE_PROFILE_STATS (frame, RMDIR, op_ret, op_errno); +        ios_free_local (frame);          
STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,                               preparent, postparent, xdata);          return 0; @@ -2244,7 +2521,8 @@ io_stats_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno,                         struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, TRUNCATE); +        UPDATE_PROFILE_STATS (frame, TRUNCATE, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,                               prebuf, postbuf, xdata);          return 0; @@ -2255,7 +2533,8 @@ int  io_stats_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, STATFS); +        UPDATE_PROFILE_STATS (frame, STATFS, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);          return 0;  } @@ -2265,7 +2544,8 @@ int  io_stats_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, SETXATTR); +        UPDATE_PROFILE_STATS (frame, SETXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2275,7 +2555,8 @@ int  io_stats_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, GETXATTR); +        UPDATE_PROFILE_STATS (frame, GETXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);          return 0;  } @@ -2285,7 +2566,8 @@ int  
io_stats_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                            int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, REMOVEXATTR); +        UPDATE_PROFILE_STATS (frame, REMOVEXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2294,7 +2576,8 @@ int  io_stats_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FSETXATTR); +        UPDATE_PROFILE_STATS (frame, FSETXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2304,7 +2587,8 @@ int  io_stats_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FGETXATTR); +        UPDATE_PROFILE_STATS (frame, FGETXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);          return 0;  } @@ -2314,7 +2598,8 @@ int  io_stats_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                             int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FREMOVEXATTR); +        UPDATE_PROFILE_STATS (frame, FREMOVEXATTR, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2324,7 +2609,8 @@ int  io_stats_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FSYNCDIR); +        UPDATE_PROFILE_STATS (frame, FSYNCDIR, 
op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2334,7 +2620,20 @@ int  io_stats_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, ACCESS); +        struct ios_local *local = frame->local; + +        /* ACCESS is called before a READ when a fop fails over +         * in NFS. We need to make sure that we are attaching the +         * data correctly to this inode. +         */ +        if (local->loc.inode && local->loc.path) { +                attach_iosstat_to_inode (this, local->loc.inode, +                                                local->loc.path, +                                                local->loc.inode->gfid); +        } + +        UPDATE_PROFILE_STATS (frame, ACCESS, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2345,7 +2644,8 @@ io_stats_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno,                          struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FTRUNCATE); +        UPDATE_PROFILE_STATS (frame, FTRUNCATE, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno,                               prebuf, postbuf, xdata);          return 0; @@ -2356,7 +2656,8 @@ int  io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FSTAT); +        UPDATE_PROFILE_STATS (frame, FSTAT, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, 
xdata);          return 0;  } @@ -2367,8 +2668,9 @@ io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,  		       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,  		       struct iatt *postbuf, dict_t *xdata)  { -	UPDATE_PROFILE_STATS(frame, FALLOCATE); -	STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, +	UPDATE_PROFILE_STATS (frame, FALLOCATE, op_ret, op_errno); +	ios_free_local (frame); +        STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,  			    xdata);  	return 0;  } @@ -2379,8 +2681,9 @@ io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,  		     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,  		     struct iatt *postbuf, dict_t *xdata)  { -	UPDATE_PROFILE_STATS(frame, DISCARD); -	STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, +	UPDATE_PROFILE_STATS (frame, DISCARD, op_ret, op_errno); +	ios_free_local (frame); +        STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,  			    xdata);  	return 0;  } @@ -2390,7 +2693,8 @@ io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                       struct iatt *postbuf, dict_t *xdata)  { -        UPDATE_PROFILE_STATS(frame, ZEROFILL); +        UPDATE_PROFILE_STATS (frame, ZEROFILL, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,                              xdata);          return 0; @@ -2400,7 +2704,8 @@ int  io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                   int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, LK); +        UPDATE_PROFILE_STATS (frame, LK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);          
return 0;  } @@ -2410,7 +2715,8 @@ int  io_stats_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, ENTRYLK); +        UPDATE_PROFILE_STATS (frame, ENTRYLK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2420,7 +2726,8 @@ int  io_stats_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, XATTROP); +        UPDATE_PROFILE_STATS (frame, XATTROP, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata);          return 0;  } @@ -2430,7 +2737,8 @@ int  io_stats_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, FXATTROP); +        UPDATE_PROFILE_STATS (frame, FXATTROP, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata);          return 0;  } @@ -2440,7 +2748,8 @@ int  io_stats_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        UPDATE_PROFILE_STATS (frame, INODELK); +        UPDATE_PROFILE_STATS (frame, INODELK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2450,6 +2759,8 @@ io_stats_entrylk (call_frame_t *frame, xlator_t *this,                    const char *volume, loc_t *loc, const char *basename,                    entrylk_cmd cmd, entrylk_type type, dict_t *xdata)  { +        ios_track_loc (frame, loc); +          
START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_entrylk_cbk, @@ -2464,6 +2775,7 @@ int  io_stats_inodelk (call_frame_t *frame, xlator_t *this,                    const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame); @@ -2479,8 +2791,8 @@ int  io_stats_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *xdata)  { - -        UPDATE_PROFILE_STATS (frame, FINODELK); +        UPDATE_PROFILE_STATS (frame, FINODELK, op_ret, op_errno); +        ios_free_local (frame);          STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);          return 0;  } @@ -2490,6 +2802,7 @@ int  io_stats_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,                     fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_finodelk_cbk, @@ -2504,6 +2817,7 @@ int  io_stats_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,                    gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_xattrop_cbk, @@ -2518,6 +2832,7 @@ int  io_stats_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,                     gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fxattrop_cbk, @@ -2532,6 +2847,7 @@ int  io_stats_lookup (call_frame_t *frame, xlator_t *this,                   loc_t *loc, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_lookup_cbk, @@ -2545,6 +2861,7 @@ io_stats_lookup (call_frame_t *frame, xlator_t *this,  int  
io_stats_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_stat_cbk, @@ -2559,6 +2876,7 @@ int  io_stats_readlink (call_frame_t *frame, xlator_t *this,                     loc_t *loc, size_t size, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_readlink_cbk, @@ -2573,6 +2891,7 @@ int  io_stats_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,                  mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_mknod_cbk, @@ -2587,9 +2906,7 @@ int  io_stats_mkdir (call_frame_t *frame, xlator_t *this,                  loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)  { -        if (loc->path) -                frame->local = gf_strdup (loc->path); - +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_mkdir_cbk, @@ -2604,6 +2921,7 @@ int  io_stats_unlink (call_frame_t *frame, xlator_t *this,                   loc_t *loc, int xflag, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_unlink_cbk, @@ -2618,6 +2936,7 @@ int  io_stats_rmdir (call_frame_t *frame, xlator_t *this,                  loc_t *loc, int flags, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_rmdir_cbk, @@ -2674,6 +2993,7 @@ int  io_stats_setattr (call_frame_t *frame, xlator_t *this,                    loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_setattr_cbk, @@ -2688,6 +3008,7 @@ int  io_stats_truncate (call_frame_t *frame, xlator_t 
*this,                     loc_t *loc, off_t offset, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_truncate_cbk, @@ -2702,8 +3023,8 @@ int  io_stats_open (call_frame_t *frame, xlator_t *this, loc_t *loc,                 int32_t flags, fd_t *fd, dict_t *xdata)  { -        if (loc->path) -                frame->local = gf_strdup (loc->path); +        ios_track_loc (frame, loc); +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame); @@ -2719,9 +3040,10 @@ int  io_stats_create (call_frame_t *frame, xlator_t *this,                   loc_t *loc, int32_t flags, mode_t mode,                   mode_t umask, fd_t *fd, dict_t *xdata) +  { -        if (loc->path) -                frame->local = gf_strdup (loc->path); +        ios_track_loc (frame, loc); +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame); @@ -2737,8 +3059,7 @@ int  io_stats_readv (call_frame_t *frame, xlator_t *this,                  fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)  { -        frame->local = fd; - +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_readv_cbk, @@ -2756,9 +3077,12 @@ io_stats_writev (call_frame_t *frame, xlator_t *this,                   uint32_t flags, struct iobref *iobref, dict_t *xdata)  {          int                 len = 0; +        struct ios_conf     *conf = NULL; +        struct ios_local    *local  = NULL; +        int                 ret = 0; + +        ios_track_fd (frame, fd); -        if (fd->inode) -                frame->local = fd->inode;          len = iov_length (vector, count);          BUMP_WRITE (fd, len); @@ -2777,6 +3101,7 @@ int  io_stats_statfs (call_frame_t *frame, xlator_t *this,                   loc_t *loc, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_statfs_cbk, @@ -2791,6 
+3116,7 @@ int  io_stats_flush (call_frame_t *frame, xlator_t *this,                  fd_t *fd, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_flush_cbk, @@ -2805,6 +3131,7 @@ int  io_stats_fsync (call_frame_t *frame, xlator_t *this,                  fd_t *fd, int32_t flags, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fsync_cbk, @@ -2971,7 +3298,7 @@ _ios_dump_thread (xlator_t *this) {                                  stats_filename, strerror(errno));                          log_stats_fopen_failure = _gf_false;                  } -                samples_logfp = fopen (samples_filename, "w+"); +                samples_logfp = fopen (samples_filename, "a");                  if (samples_logfp) {                          io_stats_dump_latency_samples_logfp (this,                                                               samples_logfp); @@ -3024,6 +3351,8 @@ io_stats_setxattr (call_frame_t *frame, xlator_t *this,                  goto out;          } +        ios_track_loc (frame, loc); +          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_setxattr_cbk, @@ -3042,6 +3371,7 @@ int  io_stats_getxattr (call_frame_t *frame, xlator_t *this,                     loc_t *loc, const char *name, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_getxattr_cbk, @@ -3056,6 +3386,7 @@ int  io_stats_removexattr (call_frame_t *frame, xlator_t *this,                        loc_t *loc, const char *name, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_removexattr_cbk, @@ -3071,6 +3402,7 @@ io_stats_fsetxattr (call_frame_t *frame, xlator_t *this,                      fd_t *fd, dict_t *dict,                      int32_t flags, dict_t *xdata)  { +        
ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fsetxattr_cbk, @@ -3085,6 +3417,7 @@ int  io_stats_fgetxattr (call_frame_t *frame, xlator_t *this,                      fd_t *fd, const char *name, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fgetxattr_cbk, @@ -3099,6 +3432,7 @@ int  io_stats_fremovexattr (call_frame_t *frame, xlator_t *this,                         fd_t *fd, const char *name, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fremovexattr_cbk, @@ -3170,6 +3504,7 @@ int  io_stats_access (call_frame_t *frame, xlator_t *this,                   loc_t *loc, int32_t mask, dict_t *xdata)  { +        ios_track_loc (frame, loc);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_access_cbk, @@ -3212,6 +3547,7 @@ int  io_stats_fstat (call_frame_t *frame, xlator_t *this,                  fd_t *fd, dict_t *xdata)  { +        ios_track_fd (frame, fd);          START_FOP_LATENCY (frame);          STACK_WIND (frame, io_stats_fstat_cbk, diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c index 270632bc71b..2eb3a9f9149 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc,          case RPC_CLNT_DISCONNECT:          case RPC_CLNT_MSG:          case RPC_CLNT_DESTROY: +        case RPC_CLNT_PING:                  break;          } diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c index 77637c7beec..459d173db7f 100644 --- a/xlators/features/changelog/src/changelog-ev-handle.c +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -180,6 +180,8 @@ 
changelog_rpc_notify (struct rpc_clnt *rpc,                  /* Free up mydata */                  changelog_rpc_clnt_unref (crpc);                  break; +        case RPC_CLNT_PING: +                break;          }          return 0; diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 640c6bb5553..d7c210f24a5 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -234,6 +234,7 @@ blkd:                                  continue;                          bcount++; +                        list_del_init (&ilock->client_list);                          list_del_init (&ilock->blocked_locks);                          list_add (&ilock->blocked_locks, &released);                  } @@ -268,6 +269,7 @@ granted:                                  continue;                          gcount++; +                        list_del_init (&ilock->client_list);                          list_del_init (&ilock->list);                          list_add (&ilock->list, &released);                  } @@ -321,6 +323,7 @@ blkd:                          bcount++; +                        list_del_init (&elock->client_list);                          list_del_init (&elock->blocked_locks);                          list_add_tail (&elock->blocked_locks, &released);                  } @@ -355,6 +358,7 @@ granted:                          }                          gcount++; +                        list_del_init (&elock->client_list);                          list_del_init (&elock->domain_list);                          list_add_tail (&elock->domain_list, &removed); diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index d56a7aca2be..c40c29de63a 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -1116,3 +1116,16 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)          return conf;  } + +gf_boolean_t +pl_does_monkey_want_stuck_lock() +{ +  
      long int          monkey_unlock_rand = 0; +        long int          monkey_unlock_rand_rem = 0; + +        monkey_unlock_rand = random (); +        monkey_unlock_rand_rem = monkey_unlock_rand % 100; +        if (monkey_unlock_rand_rem == 0) +                return _gf_true; +        return _gf_false; +} diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 5486f9b8314..3729ca24bed 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -161,4 +161,7 @@ pl_metalock_is_active (pl_inode_t *pl_inode);  int  __pl_queue_lock (pl_inode_t *pl_inode, posix_lock_t *reqlock, int can_block); + +gf_boolean_t +pl_does_monkey_want_stuck_lock();  #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index 783c57e6381..4231d760cdc 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -16,9 +16,9 @@  #include "list.h"  #include "locks.h" +#include "clear.h"  #include "common.h" -  void  __pl_entrylk_unref (pl_entry_lock_t *lock)  { @@ -111,6 +111,97 @@ __conflicting_entrylks (pl_entry_lock_t *l1, pl_entry_lock_t *l2)          return 0;  } +/* See comments in inodelk.c for details */ +static inline gf_boolean_t +__stale_entrylk (xlator_t *this, pl_entry_lock_t *candidate_lock, +                pl_entry_lock_t *requested_lock, time_t *lock_age_sec) +{ +        posix_locks_private_t  *priv = NULL; +        struct timeval curr; +        gettimeofday (&curr, NULL); + +        priv = this->private; + +        /* Question: Should we just prune them all given the +         * chance?  Or just the locks we are attempting to acquire? 
+         */ +        if (names_conflict (candidate_lock->basename, +                        requested_lock->basename)) { +                *lock_age_sec = curr.tv_sec - +                        candidate_lock->granted_time.tv_sec; +                if (*lock_age_sec > priv->revocation_secs) +                        return _gf_true; +        } +        return _gf_false; +} + +/* See comments in inodelk.c for details */ +static gf_boolean_t +__entrylk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, +                 pl_entry_lock_t *lock) +{ +        posix_locks_private_t  *priv = NULL; +        pl_entry_lock_t *tmp = NULL; +        pl_entry_lock_t *lk = NULL; +        gf_boolean_t revoke_lock = _gf_false; +        int bcount = 0; +        int gcount = 0; +        int op_errno = 0; +        clrlk_args args; +        args.opts = NULL; +        time_t lk_age_sec = 0; +        uint32_t max_blocked = 0; +        char *reason_str = NULL; + +        priv = this->private; +        args.type = CLRLK_ENTRY; +        if (priv->revocation_clear_all == _gf_true) +                args.kind = CLRLK_ALL; +        else +                args.kind = CLRLK_GRANTED; + + +        if (list_empty (&dom->entrylk_list)) +                goto out; + +        pthread_mutex_lock (&pinode->mutex); +        lock->pinode = pinode; +        list_for_each_entry_safe (lk, tmp, &dom->entrylk_list, domain_list) { +                if (__stale_entrylk (this, lk, lock, &lk_age_sec) == _gf_true) { +                        revoke_lock = _gf_true; +                        reason_str = "age"; +                        break; +                } +        } +        max_blocked = priv->revocation_max_blocked; +        if (max_blocked != 0 && revoke_lock == _gf_false) { +                list_for_each_entry_safe (lk, tmp, &dom->blocked_entrylks, +                                blocked_locks) { +                        max_blocked--; +                        if (max_blocked == 0) { +             
                   revoke_lock = _gf_true; +                                reason_str = "max blocked"; +                                break; +                        } +                } +        } +        pthread_mutex_unlock (&pinode->mutex); + +out: +        if (revoke_lock == _gf_true) { +                clrlk_clear_entrylk (this, pinode, dom, &args, &bcount, &gcount, +                    &op_errno); +                gf_log (this->name, GF_LOG_WARNING, +                        "Lock revocation [reason: %s; gfid: %s; domain: %s; " +                        "age: %ld sec] - Entry lock revoked:  %d granted & %d " +                        "blocked locks cleared", reason_str, +                        uuid_utoa (pinode->gfid), dom->domain, lk_age_sec, +                        gcount, bcount); +        } + +        return revoke_lock; +} +  /**   * entrylk_grantable - is this lock grantable?   * @inode: inode in which to look @@ -546,6 +637,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,          pl_ctx_t        *ctx              =  NULL;  	int              nonblock         =  0;          gf_boolean_t     need_inode_unref =  _gf_false; +        posix_locks_private_t  *priv = NULL; + +        priv = this->private;          if (xdata)                  dict_ret = dict_get_str (xdata, "connection-id", &conn_id); @@ -599,6 +693,24 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,           * current stack unwinds.           
*/          pinode->inode = inode_ref (inode); +        if (priv->revocation_secs != 0) { +                if (cmd != ENTRYLK_UNLOCK) { +                        __entrylk_prune_stale (this, pinode, dom, reqlock); +                } else if (priv->monkey_unlocking == _gf_true) { +                        if (pl_does_monkey_want_stuck_lock ()) { +                                gf_log (this->name, GF_LOG_WARNING, +                                    "MONKEY LOCKING (forcing stuck lock)!"); +                                op_ret = 0; +                                need_inode_unref = _gf_true; +                                pthread_mutex_lock (&pinode->mutex); +                                { +                                        __pl_entrylk_unref (reqlock); +                                } +                                pthread_mutex_unlock (&pinode->mutex); +                                goto out; +                        } +                } +        }          switch (cmd) {          case ENTRYLK_LOCK_NB: @@ -678,9 +790,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,                          "a bug report at http://bugs.gluster.com", cmd);                  goto out;          } -        if (need_inode_unref) -                inode_unref (pinode->inode); -          /* The following (extra) unref corresponds to the ref that           * was done at the time the lock was granted.           
*/ @@ -689,6 +798,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,  out: +        if (need_inode_unref) +                inode_unref (pinode->inode); +          if (unwind) {                  entrylk_trace_out (this, frame, volume, fd, loc, basename,                                     cmd, type, op_ret, op_errno); @@ -772,8 +884,6 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)          {                  list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,  					  client_list) { -                        list_del_init (&l->client_list); -  			pl_entrylk_log_cleanup (l);  			pinode = l->pinode; @@ -810,6 +920,8 @@ pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)                           *      blocked to avoid leaving L1 to starve forever.                           * iv.  unref the object.                           */ +                                list_del_init (&l->client_list); +                                  if (!list_empty (&l->domain_list)) {                                          list_del_init (&l->domain_list);                                          list_add_tail (&l->client_list, diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 1564f26b8fb..e1702c78ba1 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -16,6 +16,7 @@  #include "list.h"  #include "locks.h" +#include "clear.h"  #include "common.h"  void @@ -130,6 +131,105 @@ inodelk_conflict (pl_inode_lock_t *l1, pl_inode_lock_t *l2)                  inodelk_type_conflict (l1, l2));  } +/* + * Check to see if the candidate lock overlaps/conflicts with the + * requested lock.  If so, determine how old the lock is and return + * true if it exceeds the configured threshold, false otherwise. 
+ */ +static inline gf_boolean_t +__stale_inodelk (xlator_t *this, pl_inode_lock_t *candidate_lock, +                pl_inode_lock_t *requested_lock, time_t *lock_age_sec) +{ +        posix_locks_private_t  *priv = NULL; +        struct timeval curr; + +        priv = this->private; +        gettimeofday (&curr, NULL); +        /* Question: Should we just prune them all given the +         * chance?  Or just the locks we are attempting to acquire? +         */ +        if (inodelk_conflict (candidate_lock, requested_lock)) { +                *lock_age_sec = curr.tv_sec - +                        candidate_lock->granted_time.tv_sec; +                if (*lock_age_sec > priv->revocation_secs) +                        return _gf_true; +        } +        return _gf_false; +} + +/* Examine any locks held on this inode and potentially revoke the lock + * if the age exceeds revocation_secs.  We will clear _only_ those locks + * which are granted, and then grant those locks which are blocked. + * + * Depending on how this patch works in the wild, we may expand this and + * introduce a heuristic which clears blocked locks as well if they + * are beyond a threshold. 
+ */ +static gf_boolean_t +__inodelk_prune_stale (xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom, +                       pl_inode_lock_t *lock) +{ +        posix_locks_private_t  *priv = NULL; +        pl_inode_lock_t *tmp = NULL; +        pl_inode_lock_t *lk = NULL; +        gf_boolean_t revoke_lock = _gf_false; +        int bcount = 0; +        int gcount = 0; +        int op_errno = 0; +        clrlk_args args; +        args.opts = NULL; +        time_t lk_age_sec = 0; +        uint32_t max_blocked = 0; +        char *reason_str = NULL; + +        priv = this->private; + +        args.type = CLRLK_INODE; +        if (priv->revocation_clear_all == _gf_true) +                args.kind = CLRLK_ALL; +        else +                args.kind = CLRLK_GRANTED; + +        if (list_empty (&dom->inodelk_list)) +                goto out; + +        pthread_mutex_lock (&pinode->mutex); +        list_for_each_entry_safe (lk, tmp, &dom->inodelk_list, list) { +                if (__stale_inodelk (this, lk, lock, &lk_age_sec) == _gf_true) { +                        revoke_lock = _gf_true; +                        reason_str = "age"; +                        break; +                } +        } + +        max_blocked = priv->revocation_max_blocked; +        if (max_blocked != 0 && revoke_lock == _gf_false) { +                list_for_each_entry_safe (lk, tmp, &dom->blocked_inodelks, +                                blocked_locks) { +                        max_blocked--; +                        if (max_blocked == 0) { +                                revoke_lock = _gf_true; +                                reason_str = "max blocked"; +                                break; +                        } +                } +        } +        pthread_mutex_unlock (&pinode->mutex); + +out: +        if (revoke_lock == _gf_true) { +                clrlk_clear_inodelk (this, pinode, dom, &args, &bcount, &gcount, +                        &op_errno); +                gf_log 
(this->name, GF_LOG_WARNING, +                        "Lock revocation [reason: %s; gfid: %s; domain: %s; " +                        "age: %ld sec] - Inode lock revoked:  %d granted & %d " +                        "blocked locks cleared", +                        reason_str, uuid_utoa (pinode->gfid), dom->domain, +                        lk_age_sec, gcount, bcount); +        } +        return revoke_lock; +} +  /* Determine if lock is grantable or not */  static pl_inode_lock_t *  __inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) @@ -419,8 +519,6 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)          {                  list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,  					  client_list) { -                        list_del_init (&l->client_list); -  			pl_inodelk_log_cleanup (l);  			pl_inode = l->pl_inode; @@ -458,6 +556,8 @@ pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)                           *      forever.                           * iv.  unref the object.                           
*/ +                                list_del_init (&l->client_list); +                                  if (!list_empty (&l->list)) {                                          __delete_inode_lock (l);                                          list_add_tail (&l->client_list, @@ -509,6 +609,7 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,  		pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,                  inode_t *inode)  { +        posix_locks_private_t  *priv = NULL;          int               ret              = -EINVAL;          pl_inode_lock_t  *retlock          =  NULL;          gf_boolean_t      unref            =  _gf_true; @@ -518,6 +619,8 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,  	lock->pl_inode = pl_inode;          fl_type = lock->fl_type; +        priv = this->private; +          /* Ideally, AFTER a successful lock (both blocking and non-blocking) or           * an unsuccessful blocking lock operation, the inode needs to be ref'd.           
* @@ -537,6 +640,24 @@ pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,           */          pl_inode->inode = inode_ref (inode); +        if (priv->revocation_secs != 0) { +                if (lock->fl_type != F_UNLCK) { +                        __inodelk_prune_stale (this, pl_inode, dom, lock); +                } else if (priv->monkey_unlocking == _gf_true) { +                        if (pl_does_monkey_want_stuck_lock ()) { +                                pthread_mutex_lock (&pl_inode->mutex); +                                { +                                        __pl_inodelk_unref (lock); +                                } +                                pthread_mutex_unlock (&pl_inode->mutex); +                                inode_unref (pl_inode->inode); +                                gf_log (this->name, GF_LOG_WARNING, +                                    "MONKEY LOCKING (forcing stuck lock)!"); +                                return 0; +                        } +                } +        } +  	if (ctx)  		pthread_mutex_lock (&ctx->lock);          pthread_mutex_lock (&pl_inode->mutex); diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index e363f425b65..8eb35da44be 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -190,6 +190,10 @@ typedef struct {          mlk_mode_t      mandatory_mode; /* holds current mandatory locking mode */          gf_boolean_t    trace;          /* trace lock requests in and out */          char           *brickname; +        gf_boolean_t    monkey_unlocking; +        uint32_t        revocation_secs; +        gf_boolean_t    revocation_clear_all; +        uint32_t        revocation_max_blocked;  } posix_locks_private_t; diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 3415d59324c..7f85ba4fca5 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ 
-3629,7 +3629,21 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("trace", priv->trace, options, bool, out); +        GF_OPTION_RECONF ("monkey-unlocking", priv->monkey_unlocking, options, +                          bool, out); + +        GF_OPTION_RECONF ("revocation-secs", +                          priv->revocation_secs, options, +                          uint32, out); + +        GF_OPTION_RECONF ("revocation-clear-all", priv->revocation_clear_all, +                          options, bool, out); + +        GF_OPTION_RECONF ("revocation-max-blocked", +                          priv->revocation_max_blocked, options, +                          uint32, out);          ret = 0; +  out:          return ret;  } @@ -3680,6 +3694,18 @@ init (xlator_t *this)          GF_OPTION_INIT ("trace", priv->trace, bool, out); +        GF_OPTION_INIT ("monkey-unlocking", priv->monkey_unlocking, +                        bool, out); + +        GF_OPTION_INIT ("revocation-secs", priv->revocation_secs, +                        uint32, out); + +        GF_OPTION_INIT ("revocation-clear-all", priv->revocation_clear_all, +                        bool, out); + +        GF_OPTION_INIT ("revocation-max-blocked", priv->revocation_max_blocked, +                        uint32, out); +          this->local_pool = mem_pool_new (pl_local_t, 32);          if (!this->local_pool) {                  ret = -1; @@ -3936,5 +3962,35 @@ struct volume_options options[] = {            .description = "Trace the different lock requests "                           "to logs."          }, +        { .key  = { "monkey-unlocking" }, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "false", +          .description = "Ignore a random number of unlock requests.  Useful " +                         "for testing/creating robust lock recovery mechanisms." 
+        }, +        { .key = {"revocation-secs"}, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, +          .max = INT_MAX, +          .default_value = "0", +          .description = "Maximum time a lock can be taken out, before" +                         "being revoked.", +        }, +        { .key = {"revocation-clear-all"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "false", +          .description = "If set to true, will revoke BOTH granted and blocked " +                         "(pending) lock requests if a revocation threshold is " +                         "hit.", +        }, +        { .key = {"revocation-max-blocked"}, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, +          .max = INT_MAX, +          .default_value = "0", +          .description = "A number of blocked lock requests after which a lock " +                         "will be revoked to allow the others to proceed.  Can " +                         "be used in conjunction w/ revocation-clear-all." 
+        },          { .key = {NULL} },  }; diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c index fc2ff2ab10d..f5062971bf4 100644 --- a/xlators/features/snapview-server/src/snapview-server-mgmt.c +++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c @@ -73,7 +73,7 @@ svs_mgmt_init (xlator_t *this)          if (cmd_args->volfile_server)                  host = cmd_args->volfile_server; -        ret = rpc_transport_inet_options_build (&options, host, port); +        ret = rpc_transport_inet_options_build (&options, host, port, NULL);          if (ret) {                  gf_log (this->name, GF_LOG_ERROR, "failed to build the "                          "transport options"); diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index bf62290d023..3c21b9755ea 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -3418,7 +3418,7 @@ glusterd_transport_keepalive_options_get (int *interval, int *time,  int  glusterd_transport_inet_options_build (dict_t **options, const char *hostname, -                                       int port) +                                       int port, char *addr_family)  {          dict_t  *dict = NULL;          int32_t interval = -1; @@ -3433,7 +3433,8 @@ glusterd_transport_inet_options_build (dict_t **options, const char *hostname,                  port = GLUSTERD_DEFAULT_PORT;          /* Build default transport options */ -        ret = rpc_transport_inet_options_build (&dict, hostname, port); +        ret = rpc_transport_inet_options_build (&dict, hostname, port, +                                                addr_family);          if (ret)                  goto out; @@ -3470,6 +3471,7 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,          int                       ret     = -1;          
glusterd_peerctx_t       *peerctx = NULL;          data_t                   *data    = NULL; +        char                     *addr_family = NULL;          peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t);          if (!peerctx) @@ -3485,9 +3487,15 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,                                                           uniquely identify a                                                           peerinfo */ +        if (dict_get_str(this->options, "transport.address-family", +                         &addr_family)) { +                addr_family = NULL; +        } +          ret = glusterd_transport_inet_options_build (&options,                                                       peerinfo->hostname, -                                                     peerinfo->port); +                                                     peerinfo->port, +                                                     addr_family);          if (ret)                  goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c index 0ea66a027bf..4fdff3402f5 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handshake.c +++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c @@ -241,6 +241,50 @@ build_volfile_path (char *volume_id, char *path,          } +        volid_ptr = strstr (volume_id, "gfproxy-client/"); +        if (volid_ptr) { +                volid_ptr = strchr (volid_ptr, '/'); +                if (!volid_ptr) { +                        ret = -1; +                        goto out; +                } +                volid_ptr++; + +                ret = glusterd_volinfo_find (volid_ptr, &volinfo); +                if (ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Couldn't find volinfo"); +                        goto out; +                } + +                
glusterd_get_gfproxy_client_volfile (volinfo, path, path_len); + +                ret = 0; +                goto out; +        } + +        volid_ptr = strstr (volume_id, "gfproxy/"); +        if (volid_ptr) { +                volid_ptr = strchr (volid_ptr, '/'); +                if (!volid_ptr) { +                        ret = -1; +                        goto out; +                } +                volid_ptr++; + +                ret = glusterd_volinfo_find (volid_ptr, &volinfo); +                if (ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "Couldn't find volinfo"); +                        goto out; +                } + +                glusterd_get_gfproxyd_volfile (volinfo, path, path_len); + +                ret = 0; +                goto out; +        } +          volid_ptr = strstr (volume_id, "/snaps/");          if (volid_ptr) {                  ret = get_snap_volname_and_volinfo (volid_ptr, &volname, diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index e7ae9b7848d..bd394fc31ba 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -1791,6 +1791,7 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t  *volinfo,          int                     port = 0;          int                     rdma_port = 0;          char                    *bind_address = NULL; +        char                    *addr_family = NULL;          char                    socketpath[PATH_MAX] = {0};          char                    glusterd_uuid[1024] = {0,};          char                    valgrind_logfile[PATH_MAX] = {0}; @@ -1913,6 +1914,13 @@ retry:                                    bind_address);          } +        if (dict_get_str (this->options, "transport.address-family", +                          &addr_family) == 0) { +                runner_add_arg (&runner, "--xlator-option"); +                
runner_argprintf (&runner, "*.transport.address-family=%s", +                                  addr_family); +        } +          if (volinfo->transport_type == GF_TRANSPORT_RDMA)                  runner_argprintf (&runner, "--volfile-server-transport=rdma");          else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA) @@ -10796,6 +10804,45 @@ out:  }  void +glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo, +                                        char *path, int path_len) +{ +        char                    workdir[PATH_MAX]      = {0, }; +        glusterd_conf_t        *priv                    = THIS->private; + +        GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + +        switch (volinfo->transport_type) { +        case GF_TRANSPORT_TCP: +                snprintf (path, path_len, +                                "%s/trusted-%s.tcp-gfproxy-fuse.vol", +                                workdir, volinfo->volname); +                break; + +        case GF_TRANSPORT_RDMA: +                snprintf (path, path_len, +                                "%s/trusted-%s.rdma-gfproxy-fuse.vol", +                                workdir, volinfo->volname); +                break; +        default: +                break; +        } +} + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, +                                char *path, int path_len) +{ +        char                    workdir[PATH_MAX]      = {0, }; +        glusterd_conf_t        *priv                    = THIS->private; + +        GLUSTERD_GET_VOLUME_DIR (workdir, volinfo, priv); + +        snprintf (path, path_len, "%s/%s.gfproxyd.vol", workdir, +                  volinfo->volname); +} + +void  glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,                                  char *path, int path_len)  { diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index f4c4138829f..7445407c010 100644 --- 
a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -642,6 +642,14 @@ void  glusterd_get_rebalance_volfile (glusterd_volinfo_t *volinfo,                                  char *path, int path_len); +void +glusterd_get_gfproxy_client_volfile (glusterd_volinfo_t *volinfo, +                                     char *path, int path_len); + +void +glusterd_get_gfproxyd_volfile (glusterd_volinfo_t *volinfo, +                               char *path, int path_len); +  int32_t  glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo,                          glusterd_brickinfo_t *dup_brickinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 25fb23f72b2..2344fd169f1 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -58,6 +58,20 @@ extern struct volopt_map_entry glusterd_volopt_map[];          }                                                               \  } while (0 /* CONSTCOND */) +/** + * Needed for GFProxy + */ +#define GF_PROXY_DAEMON_PORT 40000 +#define GF_PROXY_DAEMON_PORT_STR "40000" + +static int +volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, +                            dict_t *set_dict, void *param); + +static int +build_client_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, +                    dict_t *mod_dict); +  /*********************************************   *   * xlator generation / graph manipulation API @@ -1448,6 +1462,75 @@ server_spec_extended_option_handler (volgen_graph_t *graph,  static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);  static int +gfproxy_server_graph_builder (volgen_graph_t *graph, +                                glusterd_volinfo_t *volinfo, +                                dict_t *set_dict, void *param) +{ +        xlator_t        *xl             = NULL; +        char            *value         
 = NULL; +        char            transt[16]      = {0, }; +        char            key[1024]       = {0, }; +        char            port_str[7]     = {0, }; +        int             ret             = 0; +        char            *username       = NULL; +        char            *password       = NULL; +        int             rclusters       = 0; + +        /* We are a trusted client */ +        ret = dict_set_uint32 (set_dict, "trusted-client", GF_CLIENT_TRUSTED); +        if (ret != 0) +                goto out; + +        ret = dict_set_str (set_dict, "gfproxy-server", "on"); +        if (ret != 0) +                goto out; + +        /* Build the client section of the graph first */ +        build_client_graph (graph, volinfo, set_dict); + +        /* Clear this setting so that future users of set_dict do not end up +         * thinking they are a gfproxy server */ +        dict_del (set_dict, "gfproxy-server"); +        dict_del (set_dict, "trusted-client"); + +        /* Then add the server to it */ +        get_vol_transport_type (volinfo, transt); +        xl = volgen_graph_add (graph, "protocol/server", volinfo->volname); +        if (!xl) +                goto out; + +        ret = xlator_set_option (xl, "listen-port", GF_PROXY_DAEMON_PORT_STR); +        if (ret != 0) +                goto out; + +        ret = xlator_set_option (xl, "transport-type", transt); +        if (ret != 0) +                goto out; + +        /* Set username and password */ +        username = glusterd_auth_get_username (volinfo); +        password = glusterd_auth_get_password (volinfo); +        if (username) { +                snprintf (key, sizeof (key), "auth.login.%s-server.allow", +                                volinfo->volname); +                ret = xlator_set_option (xl, key, username); +                if (ret) +                        return -1; +        } + +        if (password) { +                snprintf (key, sizeof (key), "auth.login.%s.password", +         
                       username); +                ret = xlator_set_option (xl, key, password); +                if (ret != 0) +                        goto out; +        } + +out: +        return ret; +} + +static int  brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                          dict_t *set_dict, glusterd_brickinfo_t *brickinfo)  { @@ -2541,6 +2624,48 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,  }  static int +gfproxy_server_perfxl_option_handler (volgen_graph_t *graph, +                                        struct volopt_map_entry *vme, +                                        void *param) +{ +        gf_boolean_t enabled = _gf_false; +        glusterd_volinfo_t *volinfo = NULL; + +        GF_ASSERT (param); +        volinfo = param; + +        /* write-behind is the *not* allowed for gfproxy-servers */ +        if (strstr (vme->key, "write-behind")) { +                return 0; +        } + +        perfxl_option_handler (graph, vme, param); + +        return 0; +} + +static int +gfproxy_client_perfxl_option_handler (volgen_graph_t *graph, +                                        struct volopt_map_entry *vme, +                                        void *param) +{ +        gf_boolean_t enabled = _gf_false; +        glusterd_volinfo_t *volinfo = NULL; + +        GF_ASSERT (param); +        volinfo = param; + +        /* write-behind is the only allowed "perf" for gfproxy-clients */ +        if (!strstr (vme->key, "write-behind")) +                return 0; + +        perfxl_option_handler (graph, vme, param); + +        return 0; +} + + +static int  nfsperfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,                         void *param)  { @@ -2768,8 +2893,10 @@ _free_xlator_opt_key (char *key)  }  static xlator_t * -volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, -                           char *hostname, char *subvol, char 
*xl_id, +volgen_graph_build_client (volgen_graph_t *graph, +                           glusterd_volinfo_t *volinfo, +                           char *hostname, char *port, +                           char *subvol, char *xl_id,                             char *transt, dict_t *set_dict)  {          xlator_t                *xl                 = NULL; @@ -2801,6 +2928,12 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                          goto err;          } +        if (port) { +                ret = xlator_set_option (xl, "remote-port", port); +                if (ret) +                        goto err; +        } +          ret = xlator_set_option (xl, "remote-subvolume", subvol);          if (ret)                  goto err; @@ -2824,7 +2957,8 @@ volgen_graph_build_client (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,          ret = dict_get_uint32 (set_dict, "trusted-client",                                 &client_type); -        if (!ret && client_type == GF_CLIENT_TRUSTED) { +        if (!ret && (client_type == GF_CLIENT_TRUSTED +              || client_type == GF_CLIENT_TRUSTED_PROXY)) {                  str = NULL;                  str = glusterd_auth_get_username (volinfo);                  if (str) { @@ -2911,7 +3045,9 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,          i = 0;          cds_list_for_each_entry (brick, &volinfo->bricks, brick_list) {                  xl = volgen_graph_build_client (graph, volinfo, -                                                brick->hostname, brick->path, +                                                brick->hostname, +                                                NULL, +                                                brick->path,                                                  brick->brick_id,                                                  transt, set_dict);                  if (!xl) { @@ -3143,8 +3279,9 @@ 
volgen_graph_build_snapview_client (volgen_graph_t *graph,          get_transport_type (volinfo, set_dict, transt, _gf_false); -        prot_clnt = volgen_graph_build_client (graph, volinfo, NULL, subvol, -                                               xl_id, transt, set_dict); +        prot_clnt = volgen_graph_build_client (graph, volinfo, +                                                NULL, NULL, subvol, +                                                xl_id, transt, set_dict);          if (!prot_clnt) {                  ret = -1;                  goto out; @@ -3555,6 +3692,27 @@ static int client_graph_set_perf_options(volgen_graph_t *graph,  {          data_t *tmp_data = NULL;          char *volname = NULL; +        int ret = 0; + +        /* +         * Logic to make sure gfproxy-client gets custom performance translators +         */ +        ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); +        if (ret == 1) { +                return volgen_graph_set_options_generic ( +                    graph, set_dict, volinfo, +                    &gfproxy_client_perfxl_option_handler); +        } + +        /* +         * Logic to make sure gfproxy-server gets custom performance translators +         */ +        ret = dict_get_str_boolean (set_dict, "gfproxy-server", 0); +        if (ret == 1) { +                return volgen_graph_set_options_generic ( +                        graph, set_dict, volinfo, +                        &gfproxy_server_perfxl_option_handler); +        }          /*           * Logic to make sure NFS doesn't have performance translators by @@ -3768,29 +3926,55 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,          char            *volname       = NULL;          glusterd_conf_t *conf          = THIS->private;          char            *tmp           = NULL; +        char            *hostname      = NULL;          gf_boolean_t     var           = _gf_false;          gf_boolean_t     ob            = 
_gf_false; +        gf_boolean_t    is_gfproxy     = _gf_false;          int              uss_enabled   = -1;          xlator_t        *this          = THIS; +        char            *subvol        = NULL; +        size_t          subvol_namelen = 0;          GF_ASSERT (this);          GF_ASSERT (conf); -        volname = volinfo->volname; -        ret = volgen_graph_build_clients (graph, volinfo, set_dict, -                                          param); -        if (ret) +        ret = dict_get_str_boolean (set_dict, "gfproxy-client", 0); +        if (ret == -1)                  goto out; -        if (volinfo->type == GF_CLUSTER_TYPE_TIER) -                ret = volume_volgen_graph_build_clusters_tier -                                        (graph, volinfo, _gf_false); -        else -                ret = volume_volgen_graph_build_clusters -                                        (graph, volinfo, _gf_false); +        volname = volinfo->volname; +        if (ret == 0) { +                ret = volgen_graph_build_clients (graph, volinfo, set_dict, +                                                  param); +                if (ret) +                        goto out; -        if (ret == -1) -                goto out; +                if (volinfo->type == GF_CLUSTER_TYPE_TIER) +                        ret = volume_volgen_graph_build_clusters_tier +                                                (graph, volinfo, _gf_false); +                else +                        ret = volume_volgen_graph_build_clusters +                                                (graph, volinfo, _gf_false); + +                if (ret == -1) +                        goto out; +        } else { +                is_gfproxy = _gf_true; +                ret = dict_get_str (set_dict, +                                        "config.gfproxyd-remote-host", &tmp); +                if (ret == -1) +                        goto out; + +                subvol_namelen = strlen (volinfo->volname) + +   
                             strlen ("-server") + 1; +                subvol = alloca (subvol_namelen); +                snprintf (subvol, subvol_namelen, +                                "%s-server", volinfo->volname); + +                volgen_graph_build_client (graph, volinfo, tmp, +                                           GF_PROXY_DAEMON_PORT_STR, subvol, +                                           "gfproxy", "tcp", set_dict); +        }          ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false);          if (ret == -1) @@ -3851,6 +4035,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                  }          } +        /* gfproxy needs the AHA translator */ +        if (is_gfproxy) { +                xl = volgen_graph_add (graph, "cluster/aha", volname); +                if (!xl) { +                        ret = -1; +                        goto out; +                } +        } +          if (conf->op_version == GD_OP_VERSION_MIN) {                  ret = glusterd_volinfo_get_boolean (volinfo,                                                      VKEY_FEATURES_QUOTA); @@ -4731,6 +4924,24 @@ out:          return ret;  } +static int +volgen_graph_set_iam_nfsd (const volgen_graph_t *graph) +{ +        xlator_t        *trav; +        int             ret = 0; + +        for (trav = first_of ((volgen_graph_t *)graph); trav; +                        trav = trav->next) { +                if (strcmp (trav->type, "cluster/replicate") != 0) +                        continue; + +                ret = xlator_set_option (trav, "iam-nfs-daemon", "yes"); +                if (ret) +                        break; +        } +        return ret; +} +  /* builds a graph for nfs server role, with option overrides in mod_dict */  int  build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) @@ -4869,6 +5080,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)                  if (ret)                          goto 
out; +                ret = volgen_graph_set_iam_nfsd (&cgraph); +                if (ret) +                        goto out; +                  ret = volgen_graph_merge_sub (graph, &cgraph, 1);                  if (ret)                          goto out; @@ -4930,6 +5145,22 @@ get_brick_filepath (char *filename, glusterd_volinfo_t *volinfo,                            brickinfo->hostname, brick);  } +static void +get_gfproxyd_filepath (char *filename, glusterd_volinfo_t *volinfo) +{ +        char  path[PATH_MAX]   = {0, }; +        char  brick[PATH_MAX]  = {0, }; +        glusterd_conf_t *priv  = NULL; + +        priv = THIS->private; + +        GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv); + +        snprintf (filename, PATH_MAX, +                        "%s/%s.gfproxyd.vol", path, +                        volinfo->volname); +} +  gf_boolean_t  glusterd_is_valid_volfpath (char *volname, char *brick)  { @@ -4975,6 +5206,32 @@ out:  }  static int +glusterd_generate_gfproxyd_volfile (glusterd_volinfo_t *volinfo) +{ +        volgen_graph_t graph = {0, }; +        char    filename[PATH_MAX] = {0, }; +        int     ret = -1; + +        GF_ASSERT (volinfo); + +        get_gfproxyd_filepath (filename, volinfo); + +        struct glusterd_gfproxyd_info info = { +                .port = GF_PROXY_DAEMON_PORT, +        }; + +        ret = build_graph_generic (&graph, volinfo, +                                   NULL, &info, +                                   &gfproxy_server_graph_builder); +        if (ret == 0) +                ret = volgen_write_volfile (&graph, filename); + +        volgen_graph_free (&graph); + +        return ret; +} + +static int  glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,                                   glusterd_brickinfo_t *brickinfo,                                   dict_t *mod_dict, void *data) @@ -5245,7 +5502,8 @@ glusterd_generate_client_per_brick_volfile (glusterd_volinfo_t *volinfo)          cds_list_for_each_entry 
(brick, &volinfo->bricks, brick_list) {                  xl = volgen_graph_build_client (&graph, volinfo, -                                                brick->hostname, brick->path, +                                                brick->hostname, +                                                NULL, brick->path,                                                  brick->brick_id,                                                  "tcp", dict);                  if (!xl) { @@ -5376,6 +5634,11 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo,                          ret = glusterd_get_trusted_client_filepath (filepath,                                                                      volinfo,                                                                      type); +                } else if (client_type == GF_CLIENT_TRUSTED_PROXY) { +                        glusterd_get_gfproxy_client_volfile (volinfo, +                                                             filepath, +                                                             PATH_MAX); +                        ret = dict_set_str (dict, "gfproxy-client", "on");                  } else {                          ret = glusterd_get_client_filepath (filepath,                                                              volinfo, @@ -5620,6 +5883,7 @@ build_bitd_volume_graph (volgen_graph_t *graph,                  xl = volgen_graph_build_client (&cgraph, volinfo,                                                  brickinfo->hostname, +                                                NULL,                                                  brickinfo->path,                                                  brickinfo->brick_id,                                                  transt, set_dict); @@ -5782,6 +6046,7 @@ build_scrub_volume_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                  xl = volgen_graph_build_client (&cgraph, volinfo,                                                  
brickinfo->hostname, +                                                NULL,                                                  brickinfo->path,                                                  brickinfo->brick_id,                                                  transt, set_dict); @@ -5913,12 +6178,25 @@ glusterd_create_volfiles (glusterd_volinfo_t *volinfo)                  goto out;          } +        ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED_PROXY); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not generate gfproxy client volfiles"); +                goto out; +        } +          ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER);          if (ret)                  gf_msg (this->name, GF_LOG_ERROR, 0,                          GD_MSG_VOLFILE_CREATE_FAIL,                          "Could not generate client volfiles"); + +         ret = glusterd_generate_gfproxyd_volfile (volinfo); +        if (ret) +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not generate gfproxy volfiles"); +  out:          return ret;  } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index f90177372dc..cb2cad50efc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -52,7 +52,8 @@  typedef enum {          GF_CLIENT_TRUSTED, -        GF_CLIENT_OTHER +        GF_CLIENT_OTHER, +        GF_CLIENT_TRUSTED_PROXY,  } glusterd_client_type_t;  struct volgen_graph { diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index bade4ffb06d..61c79655ccf 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -286,6 +286,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)          int32_t                 type        = 0;          char                   
*username    = NULL;          char                   *password    = NULL; +#ifdef IPV6_DEFAULT +        char                   *addr_family = "inet6"; +#else +        char                   *addr_family = "inet"; +#endif          GF_ASSERT (req); @@ -388,10 +393,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req)                  /* Setting default as inet for trans_type tcp */                  ret = dict_set_dynstr_with_alloc (dict,                                  "transport.address-family", -                                "inet"); +                                addr_family);                  if (ret) {                          gf_log (this->name, GF_LOG_ERROR, -                                "failed to set transport.address-family"); +                                "failed to set transport.address-family " +                                "to %s", addr_family);                          goto out;                  }          } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 1e24adabe0c..bcb8877c5bd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1048,6 +1048,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = 1,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "cluster.min-free-strict-mode", +          .voltype    = "cluster/distribute", +          .op_version = 1, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          { .key        = "cluster.min-free-inodes",            .voltype    = "cluster/distribute",            .op_version = 1, @@ -1113,6 +1118,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .flags       = OPT_FLAG_CLIENT_OPT,          }, +        { .key = "cluster.du-refresh-interval-sec", +          .voltype = "cluster/distribute", +          .option = "du-refresh-interval-sec", +          .op_version = 1, +      
    .flags      = OPT_FLAG_CLIENT_OPT +        }, +          /* NUFA xlator options (Distribute special case) */          { .key        = "cluster.nufa",            .voltype    = "cluster/distribute", @@ -1461,6 +1473,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .option      = "thread-count",            .op_version  = 1          }, +        { .key         = "performance.io-thread-fops-per-thread-ratio", +          .voltype     = "performance/io-threads", +          .option      = "fops-per-thread-ratio", +          .op_version  = 1 +        },          { .key         = "performance.high-prio-threads",            .voltype     = "performance/io-threads",            .op_version  = 1 @@ -1555,6 +1572,18 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = 2,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "performance.write-behind-trickling-writes", +          .voltype    = "performance/write-behind", +          .option     = "trickling-writes", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "performance.nfs.write-behind-trickling-writes", +          .voltype    = "performance/write-behind", +          .option     = "trickling-writes", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          { .key        = "performance.lazy-open",            .voltype    = "performance/open-behind",            .option     = "lazy-open", @@ -2500,6 +2529,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .voltype     = "storage/posix",            .op_version  = GD_OP_VERSION_3_6_0,          }, +        { .key         = "storage.min-free-disk", +          .voltype     = "storage/posix", +          .op_version  = 2, +        }, +        { .key         = "storage.freespace-check-interval", +          .voltype     = "storage/posix", +          .op_version  = 2, +        },          { .key         
= "storage.bd-aio",            .voltype     = "storage/bd",            .op_version  = 3 @@ -2515,6 +2552,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .option      = "!config",            .op_version  = 2          }, +        { .key         = "config.gfproxyd-remote-host", +          .voltype     = "configuration", +          .option      = "gfproxyd-remote-host", +          .op_version  = 2 +        },          { .key         = GLUSTERD_QUORUM_TYPE_KEY,            .voltype     = "mgmt/glusterd",            .value       = "off", @@ -2961,7 +3003,7 @@ struct volopt_map_entry glusterd_volopt_map[] = {          { .key        = "cluster.locking-scheme",            .voltype    = "cluster/replicate",            .type       = DOC, -          .op_version = GD_OP_VERSION_3_7_12, +          .op_version = GD_OP_VERSION_3_7_12 ,            .flags      = OPT_FLAG_CLIENT_OPT          },          { .key        = "cluster.granular-entry-heal", @@ -2970,6 +3012,72 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = GD_OP_VERSION_3_8_0,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .option      = "revocation-secs", +          .key         = "features.locks-revocation-secs", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_3_6_0, +        }, +        { .option      = "revocation-clear-all", +          .key         = "features.locks-revocation-clear-all", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_3_6_0, +        }, +        { .option      = "revocation-max-blocked", +          .key         = "features.locks-revocation-max-blocked", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_3_6_0, +        }, +        { .option      = "monkey-unlocking", +          .key         = "features.locks-monkey-unlocking", +          .voltype     = "features/locks", +          .op_version  = GD_OP_VERSION_3_6_0, +          
.type        = NO_DOC, +        }, +        { .key        = "cluster.halo-enabled", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-hybrid-mode", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-failover-enabled", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-shd-max-latency", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-nfsd-max-latency", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-max-latency", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-max-replicas", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-min-replicas", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        }, +        { .key        = "cluster.halo-min-samples", +          .voltype    = "cluster/replicate", +          .op_version = 2, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          { .key         = NULL          }  }; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index bb6af7f378f..4795f958038 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -222,6 +222,11 @@ struct 
glusterd_brickinfo {  typedef struct glusterd_brickinfo glusterd_brickinfo_t; +struct glusterd_gfproxyd_info { +        short   port; +        char    *logfile; +}; +  struct gf_defrag_brickinfo_ {          char *name;          int   files; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 6c4cdfed062..598f62fee7a 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -186,6 +186,25 @@ start_glusterfs ()      fi  #options with values start here +    if [ -n "$halo_failover_enabled" ]; then +      cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-failover-enabled=$halo_failover_enabled"); +    fi +    if [ -n "$halo_max_latency" ]; then +      cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-latency=$halo_max_latency"); +    fi + +    if [ -n "$halo_max_replicas" ]; then +      cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-replicas=$halo_max_replicas"); +    fi + +    if [ -n "$halo_min_replicas" ]; then +      cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-min-replicas=$halo_min_replicas"); +    fi +      if [ -n "$log_level" ]; then          cmd_line=$(echo "$cmd_line --log-level=$log_level");      fi @@ -479,6 +498,18 @@ with_options()              [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"              fuse_mountopts="${fuse_mountopts}$key=\"$value\""              ;; +        "halo-max-latency") +            halo_max_latency=$value +            ;; +        "halo-max-replicas") +            halo_max_replicas=$value +            ;; +        "halo-min-replicas") +          halo_min_replicas=$value +          ;; +        "halo-failover-enabled") +          halo_failover_enabled=$value +          ;;          x-*)              # comments or userspace application-specific options, drop them              ;; diff --git a/xlators/nfs/server/src/exports.h 
b/xlators/nfs/server/src/exports.h index bc9af2f0b8b..0079b9a3deb 100644 --- a/xlators/nfs/server/src/exports.h +++ b/xlators/nfs/server/src/exports.h @@ -22,7 +22,7 @@  #define GF_EXP GF_NFS"-exports"  #define NETGROUP_REGEX_PATTERN  "(@([a-zA-Z0-9\\(=, .])+)())" -#define HOSTNAME_REGEX_PATTERN  "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)" +#define HOSTNAME_REGEX_PATTERN  "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)"  #define OPTIONS_REGEX_PATTERN   "([a-zA-Z0-9=\\.]+)"  #define NETGROUP_MAX_LEN        128 diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c index 48b719d29aa..bff7e0669ff 100644 --- a/xlators/nfs/server/src/mount3.c +++ b/xlators/nfs/server/src/mount3.c @@ -1896,7 +1896,7 @@ _mnt3_get_host_from_peer (const char *peer_addr)          size_t host_len    = 0;          char   *colon      = NULL; -        colon = strchr (peer_addr, ':'); +        colon = strrchr (peer_addr, ':');          if (!colon) {                  gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER,                          "Bad peer %s", peer_addr); @@ -4123,6 +4123,15 @@ mnt1svc_init (xlator_t *nfsx)                  }          } +#ifdef IPV6_DEFAULT +        ret = dict_set_str (options, "transport.address-family", "inet6"); +        if (ret == -1) { +                gf_log (GF_NFS, GF_LOG_ERROR, +                        "dict_set_str error when trying to enable ipv6"); +              goto err; +        } +#endif +          ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);          if (ret == -1) {                  gf_msg (GF_NFS, GF_LOG_ERROR, errno, diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c index e8e226e953e..536a45ede3d 100644 --- a/xlators/nfs/server/src/mount3udp_svc.c +++ b/xlators/nfs/server/src/mount3udp_svc.c @@ -133,7 +133,15 @@ mountudp_program_3(struct svc_req *rqstp, register SVCXPRT *transp)          mountres3               *res = NULL;          struct sockaddr_in      *sin = NULL; -        
sin = svc_getcaller (transp); +        sin = (struct sockaddr_in *)svc_getcaller (transp); +        /* svc_getcaller returns a pointer to a sockaddr_in6, even though it +         * might actually be an IPv4 address. It ought return a struct sockaddr +         * and make the caller upcast it to the proper address family. Sigh. +         * +         * Let's make sure that it's actually an IPv4 address. +         */ +        GF_ASSERT (sin->sin_family == AF_INET); +          inet_ntop (AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN+1);          switch (rqstp->rq_proc) { diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c index af37f6b264c..a39a0e6ee3a 100644 --- a/xlators/nfs/server/src/nfs-common.c +++ b/xlators/nfs/server/src/nfs-common.c @@ -138,8 +138,12 @@ nfs_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)                          gf_uuid_copy (loc->gfid, inode->gfid);          } -        if (parent) +        if (parent) {                  loc->parent = inode_ref (parent); +                if (!gf_uuid_is_null (parent->gfid)) { +                        gf_uuid_copy (loc->pargfid, parent->gfid); +                } +        }          if (path) {                  loc->path = gf_strdup (path); diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c index ddfa89dab11..d5087f195ca 100644 --- a/xlators/nfs/server/src/nfs.c +++ b/xlators/nfs/server/src/nfs.c @@ -204,6 +204,9 @@ nfs_program_register_portmap_all (struct nfs_state *nfs)                  if (nfs->override_portnum)                          prog->progport = nfs->override_portnum;                  (void) rpcsvc_program_register_portmap (prog, prog->progport); +#ifdef IPV6_DEFAULT +                (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport); +#endif          }          return (0); @@ -339,6 +342,17 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)                                  if (version->required)       
                                   goto err;                          } +#ifdef IPV6_DEFAULT +                        ret = rpcsvc_program_register_rpcbind6 (prog, +                                                                prog->progport); +                        if (ret == -1) { +                                gf_msg (GF_NFS, GF_LOG_ERROR, 0, +                                        NFS_MSG_PGM_REG_FAIL, +                                        "Program (ipv6) %s registration failed", +                                        prog->progname); +                                goto err; +                        } +#endif                  }          } @@ -901,6 +915,16 @@ nfs_init_state (xlator_t *this)                  }          } +#ifdef IPV6_DEFAULT +        ret = dict_set_str (this->options, "transport.address-family", +                                           "inet6"); +        if (ret == -1) { +                gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error"); +                goto free_foppool; +        } +#endif + +          /* Right only socket support exists between nfs client and           * gluster nfs, so we can set default value as socket           */ @@ -2019,7 +2043,7 @@ struct volume_options options[] = {          },          { .key = {"nfs.mount-rmtab"},            .type = GF_OPTION_TYPE_PATH, -          .default_value = NFS_DATADIR "/rmtab", +          .default_value = "/-",            .description = "Set the location of the cache file that is used to "                           "list all the NFS-clients that have connected "                           "through the MOUNT protocol. 
If this is on shared " diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c index 64287c5b1bd..5aa9ea4e76e 100644 --- a/xlators/nfs/server/src/nfs3.c +++ b/xlators/nfs/server/src/nfs3.c @@ -372,6 +372,28 @@ out:          } while (0)                                                     \ +/* + * This macro checks if the volume is started or not. + * If it is not started, it closes the client connection & logs it. + * + * Why do we do this? + * + * There is a "race condition" where gNFSd may start listening for RPC requests + * prior to the volume being started. Presumably, that is why this macro exists + * in the first place. In the NFS kernel client (specifically Linux's NFS + * kernel client), they establish a TCP connection to our endpoint and + * (re-)send requests. If we ignore the request, and return nothing back, + * the NFS kernel client waits forever for our response. If for some reason, + * the TCP connection were to die, and re-establish, the requests are + * retransmitted and everything begins working as expected + * + * Now, this is clearly bad behavior on the client side, + * but in order to make every user's life easier, + * gNFSd should simply disconnect the TCP connection if it sees requests + * before it is ready to accept them. 
+ * + */ +  #define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl)            \          do {                                                            \                if ((!nfs_subvolume_started (nfs_state (nf3stt->nfsx), vlm))){\ @@ -379,11 +401,32 @@ out:                                  NFS_MSG_VOL_DISABLE,                    \                                  "Volume is disabled: %s",               \                                  vlm->name);                             \ +                      nfs3_disconnect_transport (req->trans);           \                        rtval = RPCSVC_ACTOR_IGNORE;                      \                        goto erlbl;                                       \                }                                                         \          } while (0)                                                     \ +void +nfs3_disconnect_transport (rpc_transport_t *transport) +{ +        int ret = 0; + +        GF_VALIDATE_OR_GOTO (GF_NFS3, transport, out); + +        ret = rpc_transport_disconnect (transport); +        if (ret != 0) { +                gf_log (GF_NFS3, GF_LOG_WARNING, +                        "Unable to close client connection to %s.", +                        transport->peerinfo.identifier); +        } else { +                gf_log (GF_NFS3, GF_LOG_WARNING, +                        "Closed client connection to %s.", +                        transport->peerinfo.identifier); +        } +out: +        return; +}  int  nfs3_export_sync_trusted (struct nfs3_state *nfs3, uuid_t exportid) @@ -778,6 +821,12 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          cs = frame->local;          if (op_ret == -1) { +                /* Prevent crashes for the case where this call fails +                 * and buf is left in a NULL state, yet the op_errno == 0. 
+                 */ +                if (!buf && op_errno == 0) { +                        op_errno = EIO; +                }                  status = nfs3_cbk_errno_status (op_ret, op_errno);          } diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 72a82082563..79845316315 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -161,8 +161,6 @@ iot_worker (void *data)          THIS = this;          for (;;) { -                sleep_till.tv_sec = time (NULL) + conf->idle_time; -                  pthread_mutex_lock (&conf->mutex);                  {                          if (pri != -1) { @@ -176,7 +174,11 @@ iot_worker (void *data)                                  }                                  conf->sleep_count++; +                                clock_gettime (CLOCK_REALTIME_COARSE, +                                               &sleep_till); +                                sleep_till.tv_sec += conf->idle_time; +                                conf->sleep_count++;                                  ret = pthread_cond_timedwait (&conf->cond,                                                                &conf->mutex,                                                                &sleep_till); @@ -232,14 +234,25 @@ int  do_iot_schedule (iot_conf_t *conf, call_stub_t *stub, int pri)  {          int   ret = 0; +        int   active_count = 0;          pthread_mutex_lock (&conf->mutex);          {                  __iot_enqueue (conf, stub, pri); -                pthread_cond_signal (&conf->cond); - -                ret = __iot_workers_scale (conf); +                /* If we have an ample supply of threads alive already +                 * it's massively more efficient to keep the ones you have +                 * busy vs making new ones and signaling everyone +                 */ +                active_count = conf->curr_count 
- conf->sleep_count; +                if (conf->fops_per_thread_ratio == 0 || active_count == 0 || +                    (conf->queue_size/active_count > +                     conf->fops_per_thread_ratio && +                     active_count < conf->max_count)) { +                        pthread_cond_signal (&conf->cond); + +                        ret = __iot_workers_scale (conf); +                }          }          pthread_mutex_unlock (&conf->mutex); @@ -904,6 +917,9 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("thread-count", conf->max_count, options, int32, out); +        GF_OPTION_RECONF ("fops-per-thread-ratio", conf->fops_per_thread_ratio, +                          options, int32, out); +          GF_OPTION_RECONF ("high-prio-threads",                            conf->ac_iot_limit[IOT_PRI_HI], options, int32, out); @@ -978,6 +994,9 @@ init (xlator_t *this)          GF_OPTION_INIT ("thread-count", conf->max_count, int32, out); +        GF_OPTION_INIT ("fops-per-thread-ratio", conf->fops_per_thread_ratio, +                        int32, out); +          GF_OPTION_INIT ("high-prio-threads",                          conf->ac_iot_limit[IOT_PRI_HI], int32, out); @@ -1140,6 +1159,20 @@ struct volume_options options[] = {                           "perform concurrent IO operations"  	}, +        { .key  = {"fops-per-thread-ratio"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = IOT_MIN_FOP_PER_THREAD, +          .max  = IOT_MAX_FOP_PER_THREAD, +          .default_value = "20", +          .description = "The optimal ratio of threads to FOPs in the queue " +                         "we wish to achieve before creating a new thread. 
" +                         "The idea here is it's far cheaper to keep our " +                         "currently running threads busy than spin up " +                         "new threads or cause a stampeding herd of threads " +                         "to service a singlular FOP when you have a thread " +                         "which will momentarily become available to do the " +                         "work." +        },  	{ .key  = {"high-prio-threads"},  	  .type = GF_OPTION_TYPE_INT,  	  .min  = IOT_MIN_THREADS, diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h index fa955b5954b..673e1967617 100644 --- a/xlators/performance/io-threads/src/io-threads.h +++ b/xlators/performance/io-threads/src/io-threads.h @@ -34,7 +34,9 @@ struct iot_conf;  #define IOT_MIN_THREADS         1  #define IOT_DEFAULT_THREADS     16 -#define IOT_MAX_THREADS         64 +#define IOT_MAX_THREADS         256 +#define IOT_MIN_FOP_PER_THREAD  0 +#define IOT_MAX_FOP_PER_THREAD  2000  #define IOT_THREAD_STACK_SIZE   ((size_t)(1024*1024)) @@ -62,6 +64,7 @@ struct iot_conf {          pthread_cond_t       cond;          int32_t              max_count;   /* configured maximum */ +        int32_t              fops_per_thread_ratio;          int32_t              curr_count;  /* actual number of threads running */          int32_t              sleep_count; diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 30443761c56..c3baafdc1b6 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -33,6 +33,7 @@ struct mdc_conf {  	gf_boolean_t cache_selinux;  	gf_boolean_t force_readdirp;          gf_boolean_t cache_swift_metadata; +        gf_boolean_t cache_all_xattrs;  }; @@ -792,6 +793,7 @@ struct checkpair {  static int  is_mdc_key_satisfied (const char *key)  { +        unsigned int checked_keys = 0;  	const char *mdc_key = 
NULL;  	int  i = 0; @@ -801,11 +803,13 @@ is_mdc_key_satisfied (const char *key)  	for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {  		if (!mdc_keys[i].load)  			continue; + +                checked_keys++;  		if (strcmp (mdc_key, key) == 0)  			return 1;  	} -	return 0; +        return 0;  } @@ -875,7 +879,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,          dict_t      *xattr_rsp = NULL;          dict_t      *xattr_alloc = NULL;          mdc_local_t *local = NULL; - +        struct mdc_conf *conf = this->private;          local = mdc_local_get (frame);          if (!local) @@ -899,10 +903,17 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,                  if (ret != 0)                          goto uncached; -                if (!mdc_xattr_satisfied (this, xdata, xattr_rsp)) +                /* Only check the keys if we are not caching all the xattrs */ +                if (!conf->cache_all_xattrs && +                    !mdc_xattr_satisfied (this, xdata, xattr_rsp)) {                          goto uncached; +                }          } +        gf_msg (this->name, GF_LOG_TRACE, 0, 0, +                "Returning lookup from cache for gfid %s", +                uuid_utoa(loc->inode->gfid)); +          MDC_STACK_UNWIND (lookup, frame, 0, 0, loc->inode, &stbuf,                            xattr_rsp, &postparent); @@ -1882,6 +1893,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,  	int           op_errno = ENODATA;          mdc_local_t  *local = NULL;  	dict_t       *xattr = NULL; +        struct mdc_conf *conf = this->private;          local = mdc_local_get (frame);          if (!local) @@ -1897,7 +1909,18 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,  		goto uncached;  	if (!xattr || !dict_get (xattr, (char *)key)) { -		ret = -1; +                /* If we can't find the extended attribute, & cache-all-xattrs +                 * is 
enabled, we should wind and try to find them. +                 * +                 * NOTE: Quota & AFR queries through the mount +                 * (i.e, virtual Gluster xattrs) +                 * won't work unless we do this. +                 */ +                if (conf->cache_all_xattrs) { +                        goto uncached; +                } + +                ret = -1;  		op_errno = ENODATA;  	} @@ -2363,7 +2386,8 @@ reconfigure (xlator_t *this, dict_t *options)  	GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out); - +        GF_OPTION_RECONF("cache-all-xattrs", conf->cache_all_xattrs, options, +                         bool, out);  out:  	return 0;  } @@ -2404,6 +2428,7 @@ init (xlator_t *this)                            conf->cache_swift_metadata);  	GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out); +        GF_OPTION_INIT ("cache-all-xattrs", conf->cache_all_xattrs, bool, out);  out:  	this->private = conf; @@ -2474,7 +2499,7 @@ struct volume_options options[] = {          { .key = {"md-cache-timeout"},            .type = GF_OPTION_TYPE_INT,            .min = 0, -          .max = 60, +          .max = 300,            .default_value = "1",            .description = "Time period after which cache has to be refreshed",          }, @@ -2484,5 +2509,19 @@ struct volume_options options[] = {  	  .description = "Convert all readdir requests to readdirplus to "  			 "collect stat info on each entry.",  	}, +        { .key = {"strict-xattrs"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "true", +          .description = "When reading extended attributes from the cache, " +                         "if an xattr is not found, attempt to find it by winding " +                         "instead of returning ENODATA. 
This is necessary to query " +                         "the special extended attributes (trusted.glusterfs.quota.size) " +                         "through a FUSE mount with md-cache enabled." +        }, +        { .key = {"cache-all-xattrs"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Cache all the extended attributes for an inode.", +        },      { .key = {NULL} },  }; diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 7f5719b1e48..bc59036ff88 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -169,6 +169,7 @@ typedef struct wb_request {  typedef struct wb_conf {          uint64_t         aggregate_size; +        uint64_t         page_size;          uint64_t         window_size;          gf_boolean_t     flush_behind;          gf_boolean_t     trickling_writes; @@ -1207,18 +1208,21 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)          char          *ptr    = NULL;          struct iobuf  *iobuf  = NULL;          struct iobref *iobref = NULL; +        struct wb_conf *conf = NULL;          int            ret    = -1;          ssize_t        required_size = 0;          size_t         holder_len = 0;          size_t         req_len = 0; +        conf = req->wb_inode->this->private; +          if (!holder->iobref) {                  holder_len = iov_length (holder->stub->args.vector,                                           holder->stub->args.count);                  req_len = iov_length (req->stub->args.vector,                                        req->stub->args.count); -                required_size = max ((THIS->ctx->page_size), +                required_size = max ((conf->page_size),                                       (holder_len + req_len));                  iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool,          
                            required_size); @@ -1281,7 +1285,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)  	wb_request_t *holder          = NULL;  	wb_conf_t    *conf            = NULL;          int           ret             = 0; -	ssize_t       page_size       = 0;  	/* With asynchronous IO from a VM guest (as a file), there  	   can be two sequential writes happening in two regions @@ -1292,7 +1295,6 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)  	   through the interleaved ops  	*/ -	page_size = wb_inode->this->ctx->page_size;  	conf = wb_inode->this->private;          list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) { @@ -1343,7 +1345,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)                          continue;                  } -		space_left = page_size - holder->write_size; +		space_left = wb_inode->window_conf - holder->write_size;  		if (space_left < req->write_size) {  			holder->ordering.go = 1; @@ -2471,6 +2473,9 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64,                            out); +        GF_OPTION_RECONF ("cache-size", conf->page_size, options, size_uint64, +                          out); +          GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,                            out); @@ -2522,6 +2527,7 @@ init (xlator_t *this)          /* configure 'option window-size <size>' */          GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out); +        GF_OPTION_INIT ("cache-size", conf->page_size, size_uint64, out);          if (!conf->window_size && conf->aggregate_size) {                  gf_msg (this->name, GF_LOG_WARNING, 0, diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index dc6e244e717..7732a9711ae 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -15,6 +15,7 @@  #include 
"glusterfs.h"  #include "statedump.h"  #include "compat-errno.h" +#include "latency.h"  #include "glusterfs3.h"  #include "portmap-xdr.h" @@ -1549,7 +1550,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi          rpc_clnt_reconfig (conf->rpc, &config);          conf->skip_notify = 1; -	conf->quick_reconnect = 1; +        conf->quick_reconnect = 1;  out:          if (frame) diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 3cb5e231fbe..3e18b4870ae 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -467,7 +467,7 @@ int32_t  client_forget (xlator_t *this, inode_t *inode)  {          /* Nothing here */ -	return 0; +        return 0;  }  int32_t @@ -545,7 +545,7 @@ out:                  STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -571,7 +571,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -600,7 +600,7 @@ out:                  STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -628,7 +628,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -657,7 +657,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -687,7 +687,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -718,7 +718,7 @@ out:                  STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -748,7 +748,7 @@ out:                  
STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -778,7 +778,7 @@ out:                  STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN,                                       NULL, NULL, NULL); -	return 0; +        return 0;  }  int32_t @@ -807,7 +807,7 @@ out:                  STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN,                                       NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -837,7 +837,7 @@ out:                  STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -867,7 +867,7 @@ out:                  STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -897,7 +897,7 @@ out:                  STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -932,7 +932,7 @@ out:                  STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN,                                       NULL, NULL, NULL, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -965,7 +965,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1000,7 +1000,7 @@ out:                  STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN,                                       NULL, 0, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -1038,7 +1038,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -1064,7 +1064,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1093,7 
+1093,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  } @@ -1120,7 +1120,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1149,7 +1149,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1177,7 +1177,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1204,7 +1204,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  }  static gf_boolean_t @@ -1393,7 +1393,7 @@ out:          if (need_unwind)                  STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); -	return 0; +        return 0;  } @@ -1423,7 +1423,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1453,7 +1453,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1482,7 +1482,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1512,7 +1512,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1542,7 +1542,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1571,7 +1571,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  }  int32_t @@ -1598,7 +1598,7 @@ out:          if (ret)                
  STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  }  int32_t @@ -1654,7 +1654,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1684,7 +1684,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1715,7 +1715,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (finodelk, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1747,7 +1747,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1780,7 +1780,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL); -	return 0; +        return 0;  } @@ -1809,7 +1809,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL); -	return 0; +        return 0;  }  int32_t @@ -1840,7 +1840,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1872,7 +1872,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL); -	return 0; +        return 0;  } @@ -1901,7 +1901,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  }  int32_t @@ -1929,7 +1929,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL); -	return 0; +        return 0;  }  int32_t @@ -2155,7 +2155,7 @@ out:          if (ret)                  STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL); -	return 0; +        return 0;  } @@ -2227,6 +2227,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,          conf = 
this->private;          switch (event) { +        case RPC_CLNT_PING: +        { +                ret = default_notify (this, GF_EVENT_CHILD_PING, NULL); +                if (ret) +                        gf_log (this->name, GF_LOG_INFO, +                                "CHILD_PING notify failed"); +                conf->last_sent_event = GF_EVENT_CHILD_PING; +                break; +        }          case RPC_CLNT_CONNECT:          {                  conf->connected = 1; @@ -2312,13 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,                  conf->connected = 0;                  conf->skip_notify = 0; -                if (conf->quick_reconnect) { -                        conf->quick_reconnect = 0; -                        rpc_clnt_start (rpc); - -                } else { +                if (conf->rpc->conn.connected) { +                        /* Having conf->connected false and +                        * conf->rpc->conn.connected true is an +                        * unrecoverable state, since rpc_clnt_reconnect +                        * will do nothing for an already connected connection. +                        * A good fix would be to ensure serialized +                        * delivery of transport messages, but that is super hard +                        * and this is rare. So... ghetto "fix", disconnect the +                        * RPC and start the race again. Maybe we'll win +                        * next time! 
+                        */ +                        gf_log (this->name, GF_LOG_WARNING, +                                "Client %s reconnect race detected, " +                                "restarting.", conf->rpc->conn.name); +                        conf->quick_reconnect = 1; +                        rpc_transport_disconnect (rpc->conn.trans);                          rpc->conn.config.remote_port = 0; - +                } else { +                        if (conf->quick_reconnect) { +                                conf->quick_reconnect = 0; +                                rpc_clnt_start (rpc); +                        } else { +                                rpc->conn.config.remote_port = 0; +                        }                  }                  break; @@ -2670,7 +2696,7 @@ reconfigure (xlator_t *this, dict_t *options)          ret = 0;  out: -	return ret; +        return ret;  } @@ -2724,6 +2750,8 @@ init (xlator_t *this)          this->private = conf; +        this->client_latency.min = UINT64_MAX; +          /* If it returns -1, then its a failure, if it returns +1 we need             have to understand that 'this' is subvolume of a xlator which,             will set the remote host and remote subvolume in a setxattr @@ -3001,7 +3029,7 @@ struct volume_options options[] = {            .type  = GF_OPTION_TYPE_TIME,            .min   = 0,            .max   = 1013, -          .default_value = "42", +          .default_value = "180",            .description = "Time duration for which the client waits to "                           "check if the server is responsive."          
}, diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c index d5410573ac3..c22f79fa872 100644 --- a/xlators/protocol/server/src/server-rpc-fops.c +++ b/xlators/protocol/server/src/server-rpc-fops.c @@ -33,6 +33,10 @@  void  forget_inode_if_no_dentry (inode_t *inode)  { +        if (!inode) { +                return; +        } +          if (!inode_has_dentry (inode))                  inode_forget (inode, 0); diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index d8ef5f7b73f..636108affbb 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          priv = this->private; +        if (!posix_write_ok (this, priv)) { +                op_errno = ENOSPC; +                goto err; +        } +          ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);          if (ret < 0) {                  gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 2c51d1967a8..8f85f8c8ba1 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -672,6 +672,81 @@ out:          return 0;  } +static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats, +                                 double min_free_disk, +                                 gf_boolean_t previously_ok) +{ +        gf_boolean_t currently_ok; + +        if (min_free_disk < 100.0) { +                double free_percent = 100.0 * stats->f_bavail / stats->f_blocks; + +                currently_ok = +                    free_percent >= min_free_disk ? 
_gf_true : _gf_false; +                if (previously_ok && !currently_ok) { +                        gf_log (this->name, GF_LOG_WARNING, +                               "min-free-disk limit exceeded: free percent " +                               "%f%% < %f%%. Writes disabled.", +                               free_percent, min_free_disk); +                } +        } else { +                double free_bytes = stats->f_bavail * stats->f_frsize; + +                currently_ok = +                    free_bytes >= min_free_disk ? _gf_true : _gf_false; +                if (previously_ok && !currently_ok) { +                        gf_log (this->name, GF_LOG_WARNING, +                               "min-free-disk limit exceeded: free bytes %f " +                               "< %f. Writes disabled.", +                               free_bytes, min_free_disk); +                } +        } + +        if (currently_ok && !previously_ok) { +                gf_log (this->name, GF_LOG_INFO, "Free space has risen above " +                                                "min-free-disk limit, writes " +                                                "re-enabled."); +        } + +        return currently_ok; +} + +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv) +{ +        /* Check if there is sufficient free space to allow writes. +         * +         * This is called in the write path, so performance matters. We +         * periodically sample free space by calling statvfs(). +         * freespace_check_lock is used to ensure only one process at a +         * time makes the call; if the lock is contended, the previous +         * status (reflected in freespace_check_passed) is used while +         * the process that holds the mutex updates the current status. 
+         */ +        if (!priv->freespace_check_interval) { +                return _gf_true; +        } + +        if (!pthread_mutex_trylock (&priv->freespace_check_lock)) { +                struct timespec now; + +                clock_gettime (CLOCK_MONOTONIC, &now); +                if (now.tv_sec >= priv->freespace_check_last.tv_sec + +                                      priv->freespace_check_interval) { +                        sys_statvfs (priv->base_path, &priv->freespace_stats); +                        priv->freespace_check_last.tv_sec = now.tv_sec; + +                        priv->freespace_check_passed = freespace_ok ( +                            this, &priv->freespace_stats, priv->min_free_disk, +                            priv->freespace_check_passed); +                } + +                pthread_mutex_unlock (&priv->freespace_check_lock); +        } + +        return priv->freespace_check_passed; +} +  static int32_t  posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,                      int32_t flags, off_t offset, size_t len, @@ -681,6 +756,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,          int32_t             op_errno = 0;          struct posix_fd    *pfd    = NULL;          gf_boolean_t        locked = _gf_false; +        struct posix_private *priv = this->private;          DECLARE_OLD_FS_ID_VAR; @@ -689,6 +765,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,          VALIDATE_OR_GOTO (frame, out);          VALIDATE_OR_GOTO (this, out);          VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (priv, out); + +        if (!posix_write_ok (this, priv)) { +                ret = -ENOSPC; +                goto out; +        }          ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);          if (ret < 0) { @@ -3321,6 +3403,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          VALIDATE_OR_GOTO (priv, out); +        if (!posix_write_ok (this, 
priv)) { +                op_errno = ENOSPC; +                op_ret = -1; +                goto out; +        } +          ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);          if (ret < 0) {                  gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, @@ -6685,6 +6773,16 @@ struct posix_private *priv = NULL;                            options, uint32, out);          posix_spawn_health_check_thread (this); +        pthread_mutex_lock (&priv->freespace_check_lock); +        { +                GF_OPTION_RECONF ("freespace-check-interval", +                                   priv->freespace_check_interval, +                                   options, uint32, out); +                GF_OPTION_RECONF ("min-free-disk", priv->min_free_disk, options, +                                  percent_or_size, out); +        } +        pthread_mutex_unlock (&priv->freespace_check_lock); +  	ret = 0;  out:  	return ret; @@ -7299,6 +7397,19 @@ init (xlator_t *this)          GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,                          uint32, out); + +        GF_OPTION_INIT ("freespace-check-interval", +                        _private->freespace_check_interval, uint32, out); + +        GF_OPTION_INIT ("min-free-disk", _private->min_free_disk, +                        percent_or_size, out); + +        pthread_mutex_init (&_private->freespace_check_lock, NULL); +        sys_statvfs (_private->base_path, &_private->freespace_stats); +        clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last); +        _private->freespace_check_passed = freespace_ok ( +                this, &_private->freespace_stats, _private->min_free_disk, +                _gf_true);  out:          return ret;  } @@ -7476,5 +7587,22 @@ struct volume_options options[] = {  	  "\t- Strip: Will strip the user namespace before setting. 
The raw filesystem will work in OS X.\n"          },  #endif +        { .key  = {"min-free-disk"}, +          .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, +          .default_value = "2%", +          .description = "Minimum percentage/size of disk space, after which we" +                         "start failing writes with ENOSPC." +        }, +        { +          .key = {"freespace-check-interval"}, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, +          .default_value = "5", +          .validate = GF_OPT_VALIDATE_MIN, +          .description = "Interval in seconds between freespace measurements " +                         "used for the min-free-disk determination. " +                         "Set to 0 to disable." +        }, +          { .key  = {NULL} }  }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 87f91e57747..ef4bc66ecbc 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -174,7 +174,14 @@ struct posix_private {                  XATTR_BOTH,          } xattr_user_namespace;  #endif - +        /* freespace_check_lock protects access to following three fields. */ +        pthread_mutex_t freespace_check_lock; +        struct timespec freespace_check_last; +        struct statvfs freespace_stats; +        double min_free_disk; +        /* mutex protection ends. */ +        uint32_t freespace_check_interval; +        gf_boolean_t freespace_check_passed;  };  typedef struct { @@ -263,6 +270,9 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode,  void  posix_gfid_unset (xlator_t *this, dict_t *xdata); +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv); +  int  posix_pacl_set (const char *path, const char *key, const char *acl_s);  | 
