diff options
38 files changed, 908 insertions, 142 deletions
diff --git a/cli/src/cli.c b/cli/src/cli.c index 25fe58568a5..ba2ac43d7f3 100644 --- a/cli/src/cli.c +++ b/cli/src/cli.c @@ -613,6 +613,11 @@ cli_rpc_init (struct cli_state *state) int ret = -1; int port = CLI_GLUSTERD_PORT; xlator_t *this = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif this = THIS; cli_rpc_prog = &cli_prog; @@ -648,7 +653,8 @@ cli_rpc_init (struct cli_state *state) goto out; ret = dict_set_str (options, "transport.address-family", - "inet"); + addr_family); + if (ret) goto out; } diff --git a/configure.ac b/configure.ac index ee245c90a43..b181841ad17 100644 --- a/configure.ac +++ b/configure.ac @@ -304,7 +304,14 @@ if test "x$enable_debug" = "xyes"; then CFLAGS="${CFLAGS} -g -O0 -DDEBUG" else BUILD_DEBUG=no - CFLAGS="${CFLAGS} -g -O2" + CFLAGS="${CFLAGS} -g" +fi + +AC_ARG_WITH([fbextras], AC_HELP_STRING([--with-fbextras], [Enable Facebook specific extras.])) +if test "x$with_fbextras" = "xyes"; then + BUILD_FBEXTRAS=yes +else + BUILD_FBEXTRAS=no fi AC_ARG_ENABLE([privport_tracking], @@ -1061,6 +1068,13 @@ AC_SUBST(GF_DISTRIBUTION) GF_HOST_OS="" GF_LDFLAGS="-rdynamic" +dnl include tirpc for FB builds +if test "x$BUILD_FBEXTRAS" = "xyes"; then + TIRPC_CFLAGS="-I/usr/include/tirpc" + GF_LDFLAGS="-lfbtirpc $GF_LDFLAGS" + GF_CFLAGS="$GF_CFLAGS $TIRPC_CFLAGS -DIPV6_DEFAULT" +fi + dnl check for gcc -Werror=format-security saved_CFLAGS=$CFLAGS CFLAGS="-Wformat -Werror=format-security" @@ -1539,6 +1553,7 @@ AC_SUBST([GF_CPPFLAGS]) AM_CONDITIONAL([GF_LINUX_HOST_OS], test "${GF_HOST_OS}" = "GF_LINUX_HOST_OS") AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS") AM_CONDITIONAL([GF_BSD_HOST_OS], test "${GF_HOST_OS}" = "GF_BSD_HOST_OS") +AM_CONDITIONAL([GF_FBEXTRAS], test "${BUILD_FBEXTRAS}" = "yes") AC_SUBST(GLUSTERD_WORKDIR) AM_CONDITIONAL([GF_INSTALL_GLUSTERD_WORKDIR], test ! -d ${GLUSTERD_WORKDIR} && test -d ${sysconfdir}/glusterd ) diff --git a/glusterfs.spec.in b/glusterfs.spec.in index 7875638f849..fb5f35971ac 100644 --- a/glusterfs.spec.in +++ b/glusterfs.spec.in @@ -17,6 +17,10 @@ # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with valgrind %{?_with_valgrind:%global _with_valgrind --enable-valgrind} +# if you wish to compile an rpm with Facebook specfic extras... +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with fbextras +%{?_with_fbextras:%global _with_fbextras --with-fbextras} + # if you wish to compile an rpm with cmocka unit testing... # rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --with cmocka %{?_with_cmocka:%global _with_cmocka --enable-cmocka} @@ -215,6 +219,9 @@ BuildRequires: python2-devel %if ( 0%{?fedora} && 0%{?fedora} < 26 ) || ( 0%{?rhel} ) BuildRequires: python-ctypes %endif +%if ( 0%{?_with_fbextras:1} ) +BuildRequires: fb-libtirpc fb-libtirpc-devel +%endif BuildRequires: userspace-rcu-devel >= 0.7 %if ( 0%{?rhel} && 0%{?rhel} <= 6 ) BuildRequires: automake @@ -549,6 +556,9 @@ Requires: %{name}-cli%{?_isa} = %{version}-%{release} Requires: %{name}-libs%{?_isa} = %{version}-%{release} # some daemons (like quota) use a fuse-mount, glusterfsd is part of -fuse Requires: %{name}-fuse%{?_isa} = %{version}-%{release} +%if ( 0%{?_with_fbextras:1} ) +Requires: fb-libtirpc >= 0.2.5-1 +%endif # self-heal daemon, rebalance, nfs-server etc. are actually clients Requires: %{name}-api%{?_isa} = %{version}-%{release} Requires: %{name}-client-xlators%{?_isa} = %{version}-%{release} @@ -665,7 +675,9 @@ export CFLAGS %{?_without_ocf} \ %{?_without_rdma} \ %{?_without_syslog} \ - %{?_without_tiering} + %{?_without_tiering} \ + %{?_without_events} \ + %{?_with_fbextras} # fix hardening and remove rpath in shlibs %if ( 0%{?fedora} && 0%{?fedora} > 17 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 56f545f28de..2acd83f36cf 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -194,26 +194,16 @@ gf_rev_dns_lookup (const char *ip) { char *fqdn = NULL; int ret = 0; - struct sockaddr_in sa = {0}; - char host_addr[256] = {0, }; GF_VALIDATE_OR_GOTO ("resolver", ip, out); - sa.sin_family = AF_INET; - inet_pton (AF_INET, ip, &sa.sin_addr); - ret = getnameinfo ((struct sockaddr *)&sa, sizeof (sa), host_addr, - sizeof (host_addr), NULL, 0, 0); - + /* Get the FQDN */ + ret = gf_get_hostname_from_ip ((char *)ip, &fqdn); if (ret != 0) { gf_msg ("resolver", GF_LOG_INFO, errno, LG_MSG_RESOLVE_HOSTNAME_FAILED, "could not resolve " "hostname for %s", ip); - goto out; } - - /* Get the FQDN */ - fqdn = gf_strdup (host_addr); - out: return fqdn; } @@ -3127,11 +3117,13 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) char *client_ip_copy = NULL; char *tmp = NULL; char *ip = NULL; + size_t addr_sz = 0; /* if ipv4, reverse lookup the hostname to * allow FQDN based rpc authentication */ - if (valid_ipv4_address (client_ip, strlen (client_ip), 0) == _gf_false) { + if (!valid_ipv6_address (client_ip, strlen (client_ip), 0) && + !valid_ipv4_address (client_ip, strlen (client_ip), 0)) { /* most times, we get a.b.c.d:port form, so check that */ client_ip_copy = gf_strdup (client_ip); if (!client_ip_copy) @@ -3144,12 +3136,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) if (valid_ipv4_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *)&client_sock_in; + addr_sz = sizeof (client_sock_in); client_sock_in.sin_family = AF_INET; ret = inet_pton (AF_INET, ip, (void *)&client_sock_in.sin_addr.s_addr); } else if (valid_ipv6_address (ip, strlen (ip), 0) == _gf_true) { client_sockaddr = (struct sockaddr *) &client_sock_in6; + addr_sz = sizeof (client_sock_in6); client_sock_in6.sin6_family = AF_INET6; ret = inet_pton (AF_INET6, ip, @@ -3163,8 +3157,14 @@ gf_get_hostname_from_ip (char *client_ip, char **hostname) goto out; } + /* You cannot just use sizeof (*client_sockaddr), as per the man page + * the (getnameinfo) size must be the size of the underlying sockaddr + * struct e.g. sockaddr_in6 or sockaddr_in. Failure to do so will + * break IPv6 hostname resolution (IPv4 will work only because + * the sockaddr_in struct happens to be of the correct size). + */ ret = getnameinfo (client_sockaddr, - sizeof (*client_sockaddr), + addr_sz, client_hostname, sizeof (client_hostname), NULL, 0, 0); if (ret) { diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h index fbaac76b9ee..0f61aaae55e 100644 --- a/libglusterfs/src/compat.h +++ b/libglusterfs/src/compat.h @@ -479,6 +479,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0); #endif +#ifndef IPV6_DEFAULT + #ifndef IXDR_GET_LONG #define IXDR_GET_LONG(buf) ((long)IXDR_GET_U_INT32(buf)) #endif @@ -495,6 +497,8 @@ int gf_mkostemp (char *tmpl, int suffixlen, int flags); #define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG(buf, (long)(v)) #endif +#endif /* IPV6_DEFAULT */ + #if defined(__GNUC__) && !defined(RELAX_POISONING) /* Use run API, see run.h */ #include <stdlib.h> /* system(), mkostemp() */ diff --git a/libglusterfs/src/defaults-tmpl.c b/libglusterfs/src/defaults-tmpl.c index 5b7578b7c57..0889e07caa7 100644 --- a/libglusterfs/src/defaults-tmpl.c +++ b/libglusterfs/src/defaults-tmpl.c @@ -170,6 +170,18 @@ default_notify (xlator_t *this, int32_t event, void *data, ...) } } break; + case GF_EVENT_CHILD_PING: + { + xlator_list_t *parent = this->parents; + + while (parent) { + if (parent->xlator->init_succeeded) + xlator_notify (parent->xlator, event, + this, data); + parent = parent->next; + } + } + break; default: { xlator_list_t *parent = this->parents; diff --git a/libglusterfs/src/latency.c b/libglusterfs/src/latency.c index 5025de6c8cf..1d75f5b98ce 100644 --- a/libglusterfs/src/latency.c +++ b/libglusterfs/src/latency.c @@ -21,6 +21,7 @@ #include "statedump.h" #include "libglusterfs-messages.h" +static int gf_set_fop_from_fn_pointer_warning; void gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void *fn) { @@ -108,8 +109,15 @@ gf_set_fop_from_fn_pointer (call_frame_t *frame, struct xlator_fops *fops, void fop = GF_FOP_READDIRP; else if (fops->getspec == *(fop_getspec_t *)&fn) fop = GF_FOP_GETSPEC; - else - fop = -1; + else if (fops->ipc == *(fop_ipc_t *)&fn) + fop = GF_FOP_IPC; + else { + fop = GF_FOP_NULL; + GF_LOG_OCCASIONALLY(gf_set_fop_from_fn_pointer_warning, + "latency", + GF_LOG_WARNING, + "Unknown FOP type"); + } frame->op = fop; } diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index 59c7e77d548..d5b2fcd64bd 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -173,6 +173,7 @@ enum gf_common_mem_types_ { gf_common_mt_tbf_bucket_t, gf_common_mt_tbf_throttle_t, gf_common_mt_pthread_t, + gf_common_ping_local_t, gf_common_mt_end }; #endif diff --git a/libglusterfs/src/timespec.c b/libglusterfs/src/timespec.c index f7b2bea2f30..903303d1380 100644 --- a/libglusterfs/src/timespec.c +++ b/libglusterfs/src/timespec.c @@ -60,3 +60,15 @@ void timespec_adjust_delta (struct timespec *ts, struct timespec delta) ts->tv_sec += ((ts->tv_nsec + delta.tv_nsec) / 1000000000); ts->tv_sec += delta.tv_sec; } + +void timespec_sub (const struct timespec *begin, const struct timespec *end, + struct timespec *res) +{ + if (end->tv_nsec < begin->tv_nsec) { + res->tv_sec = end->tv_sec - begin->tv_sec - 1; + res->tv_nsec = end->tv_nsec + 1000000000 - begin->tv_nsec; + } else { + res->tv_sec = end->tv_sec - begin->tv_sec; + res->tv_nsec = end->tv_nsec - begin->tv_nsec; + } +} diff --git a/libglusterfs/src/timespec.h b/libglusterfs/src/timespec.h index f37194b97cf..9c393ee7166 100644 --- a/libglusterfs/src/timespec.h +++ b/libglusterfs/src/timespec.h @@ -20,5 +20,8 @@ void timespec_now (struct timespec *ts); void timespec_adjust_delta (struct timespec *ts, struct timespec delta); +void timespec_sub (const struct timespec *begin, + const struct timespec *end, + struct timespec *res); #endif /* __INCLUDE_TIMESPEC_H__ */ diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index c2959efbd95..e0f82a70764 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -933,6 +933,7 @@ struct _xlator { gf_loglevel_t loglevel; /* Log level for translator */ + int64_t client_latency; /* for latency measurement */ fop_latency_t latencies[GF_FOP_MAXVALUE]; diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index e042121ad47..5a97f4bb9cf 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -18,6 +18,7 @@ #include "mem-pool.h" #include "xdr-rpc.h" #include "rpc-common-xdr.h" +#include "timespec.h" char *clnt_ping_procs[GF_DUMP_MAXVALUE] = { @@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = { .procnames = clnt_ping_procs, }; +struct ping_local { + struct rpc_clnt *rpc; + struct timespec submit_time; +}; + /* Must be called under conn->lock */ static int __rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk) @@ -170,11 +176,17 @@ int rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { - struct rpc_clnt *rpc = NULL; + struct ping_local *local = NULL; xlator_t *this = NULL; rpc_clnt_connection_t *conn = NULL; call_frame_t *frame = NULL; int unref = 0; + gf_boolean_t call_notify = _gf_false; + + struct timespec now; + struct timespec delta; + int64_t latency_msec = 0; + int ret = 0; if (!myframe) { gf_log (THIS->name, GF_LOG_WARNING, @@ -184,14 +196,23 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, frame = myframe; this = frame->this; - rpc = frame->local; - frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */ - conn = &rpc->conn; + local = frame->local; + conn = &local->rpc->conn; + + timespec_now (&now); + timespec_sub (&local->submit_time, &now, &delta); + latency_msec = delta.tv_sec * 1000 + delta.tv_nsec / 1000000; pthread_mutex_lock (&conn->lock); { + this->client_latency = latency_msec; + gf_log (THIS->name, GF_LOG_DEBUG, + "Ping latency is %" PRIu64 "ms", + latency_msec); + + call_notify = _gf_true; if (req->rpc_status == -1) { - unref = rpc_clnt_remove_ping_timer_locked (rpc); + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (unref) { gf_log (this->name, GF_LOG_WARNING, "socket or ib related error"); @@ -206,8 +227,8 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, goto unlock; } - unref = rpc_clnt_remove_ping_timer_locked (rpc); - if (__rpc_clnt_rearm_ping_timer (rpc, + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); + if (__rpc_clnt_rearm_ping_timer (local->rpc, rpc_clnt_start_ping) == -1) { gf_log (this->name, GF_LOG_WARNING, "failed to set the ping timer"); @@ -216,12 +237,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, } unlock: pthread_mutex_unlock (&conn->lock); + + if (call_notify) { + ret = local->rpc->notifyfn (local->rpc, this, RPC_CLNT_PING, + (void *)(uintptr_t)latency_msec); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "RPC_CLNT_PING notify failed"); + } + } out: if (unref) - rpc_clnt_unref (rpc); + rpc_clnt_unref (local->rpc); - if (frame) + if (frame) { + GF_FREE (frame->local); + frame->local = NULL; STACK_DESTROY (frame->root); + } return 0; } @@ -231,18 +264,28 @@ rpc_clnt_ping (struct rpc_clnt *rpc) call_frame_t *frame = NULL; int32_t ret = -1; rpc_clnt_connection_t *conn = NULL; + struct ping_local *local = NULL; conn = &rpc->conn; + local = GF_CALLOC (1, sizeof(struct ping_local), + gf_common_ping_local_t); + if (!local) + return ret; frame = create_frame (THIS, THIS->ctx->pool); - if (!frame) + if (!frame) { + GF_FREE (local); return ret; + } - frame->local = rpc; + local->rpc = rpc; + timespec_now (&local->submit_time); + frame->local = local; ret = rpc_clnt_submit (rpc, &clnt_ping_prog, GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0, NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL); if (ret) { + /* FIXME: should we free the frame here? Methinks so! */ gf_log (THIS->name, GF_LOG_ERROR, "failed to start ping timer"); } diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index b731ba2dfad..e5e493cd79e 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -19,6 +19,7 @@ typedef enum { RPC_CLNT_CONNECT, RPC_CLNT_DISCONNECT, + RPC_CLNT_PING, RPC_CLNT_MSG, RPC_CLNT_DESTROY } rpc_clnt_event_t; diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c index 4fbbc8cd7fc..4c3d5279fe1 100644 --- a/rpc/rpc-lib/src/rpc-transport.c +++ b/rpc/rpc-lib/src/rpc-transport.c @@ -656,6 +656,11 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, dict_t *dict = NULL; char *host = NULL; int ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif GF_ASSERT (options); GF_ASSERT (hostname); @@ -686,6 +691,13 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, goto out; } + ret = dict_set_str (dict, "address-family", addr_family); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set address-family to %s", addr_family); + goto out; + } + ret = dict_set_str (dict, "transport-type", "socket"); if (ret) { gf_log (THIS->name, GF_LOG_WARNING, diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c index 2be54a382f9..4cd2be17085 100644 --- a/rpc/rpc-lib/src/rpcsvc.c +++ b/rpc/rpc-lib/src/rpcsvc.c @@ -37,6 +37,10 @@ #include <stdarg.h> #include <stdio.h> +#ifdef IPV6_DEFAULT +#include <netconfig.h> +#endif + #include "xdr-rpcclnt.h" #include "glusterfs-acl.h" @@ -1386,6 +1390,82 @@ rpcsvc_error_reply (rpcsvc_request_t *req) return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL); } +#ifdef IPV6_DEFAULT +int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port) +{ + const int IP_BUF_LEN = 64; + char addr_buf[IP_BUF_LEN]; + + int err = 0; + bool_t success = 0; + struct netconfig *nc; + struct netbuf *nb; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + + err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff, + port & 0xff); + if (err < 0) { + err = -1; + goto out; + } + + nb = uaddr2taddr (nc, addr_buf); + if (!nb) { + err = -1; + goto out; + } + + success = rpcb_set (newprog->prognum, newprog->progver, nc, nb); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6" + " service with rpcbind"); + } + + err = 0; + +out: + return err; +} + +int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog) +{ + int err = 0; + bool_t success = 0; + struct netconfig *nc; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + success = rpcb_unset (newprog->prognum, newprog->progver, nc); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6" + " service with rpcbind"); + } + + err = 0; +out: + return err; +} +#endif /* Register the program with the local portmapper service. */ int @@ -1550,7 +1630,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) " program failed"); goto out; } - +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_unregister_rpcbind6 (program); + if (ret == -1) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)" + " unregistration of program failed"); + goto out; + } +#endif pthread_mutex_lock (&svc->rpclock); { list_for_each_entry (prog, &svc->programs, program) { diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h index cf3e5906de1..8097a525dab 100644 --- a/rpc/rpc-lib/src/rpcsvc.h +++ b/rpc/rpc-lib/src/rpcsvc.h @@ -434,6 +434,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener); extern int rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port); +#ifdef IPV6_DEFAULT +extern int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port); +#endif + extern int rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog); diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c index e9de1a78813..56e9bb8d31b 100644 --- a/rpc/rpc-transport/socket/src/name.c +++ b/rpc/rpc-transport/socket/src/name.c @@ -562,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) data_t *address_family_data = NULL; int32_t ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; + sa_family_t default_family = AF_INET6; +#else + char *addr_family = "inet"; + sa_family_t default_family = AF_INET; +#endif + GF_VALIDATE_OR_GOTO ("socket", sa_family, out); address_family_data = dict_get (this->options, @@ -586,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) } } else { gf_log (this->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting to inet"); - *sa_family = AF_INET; + "option address-family not specified, " + "defaulting to %s", addr_family); + *sa_family = default_family; } ret = 0; diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c index e6e73bf8f22..051022290fa 100644 --- a/rpc/rpc-transport/socket/src/socket.c +++ b/rpc/rpc-transport/socket/src/socket.c @@ -3101,6 +3101,21 @@ socket_connect (rpc_transport_t *this, int port) } } + /* Make sure we are not vulnerable to someone setting + * net.ipv6.bindv6only to 1 so that gluster services are + * avalable over IPv4 & IPv6. + */ + int disable_v6only = 0; + + if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&disable_v6only, + sizeof (disable_v6only)) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Error disabling sockopt IPV6_V6ONLY: \"%s\"", + strerror (errno)); + } + + if (priv->nodelay && (sa_family != AF_UNIX)) { ret = __socket_nodelay (priv->sock); diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x index 7b0bcb33213..5b7fe001f1c 100644 --- a/rpc/xdr/src/glusterfs-fops.x +++ b/rpc/xdr/src/glusterfs-fops.x @@ -103,6 +103,7 @@ enum glusterfs_event_t { GF_EVENT_SOME_DESCENDENT_DOWN, GF_EVENT_SCRUB_ONDEMAND, GF_EVENT_SOME_DESCENDENT_UP, + GF_EVENT_CHILD_PING, GF_EVENT_MAXVAL }; diff --git a/tests/basic/exports_parsing.t b/tests/basic/exports_parsing.t index fdaf9c2822e..da88bbcb2cc 100644 --- a/tests/basic/exports_parsing.t +++ b/tests/basic/exports_parsing.t @@ -32,7 +32,20 @@ function test_bad_opt () glusterfsd --print-exports $1 2>&1 | sed -n 1p } -EXPECT_KEYWORD "/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,)" test_good_file $EXP_FILES/exports +function check_export_line() { + if [ "$1" == "$2" ]; then + echo "Y" + else + echo "N" + fi + return +} + +export_result=$(test_good_file $EXP_FILES/exports) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 10.35.11.31(rw,anonuid=0,sec=sys,) ' "$export_result" + +export_result=$(test_good_file $EXP_FILES/exports-v6) +EXPECT "Y" check_export_line '/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) ' "$export_result" EXPECT_KEYWORD "Error parsing netgroups for:" test_bad_line $EXP_FILES/bad_exports EXPECT_KEYWORD "Error parsing netgroups for:" test_long_netgroup $EXP_FILES/bad_exports diff --git a/tests/basic/mount-nfs-auth.t b/tests/basic/mount-nfs-auth.t index 9df5cb45c3b..cd0189788ba 100755 --- a/tests/basic/mount-nfs-auth.t +++ b/tests/basic/mount-nfs-auth.t @@ -15,6 +15,9 @@ TEST glusterd TEST pidof glusterd TEST $CLI volume info +H0IP=$(ip addr show |grep -w inet |grep -v 127.0.0.1|awk '{ print $2 }'| cut -d "/" -f 1) +H0IP6=$(host $HOSTNAME | grep IPv6 | awk '{print $NF}') + # Export variables for allow & deny EXPORT_ALLOW="/$V0 $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" EXPORT_ALLOW_SLASH="/$V0/ $H0(sec=sys,rw,anonuid=0) @ngtop(sec=sys,rw,anonuid=0)" @@ -37,6 +40,10 @@ function build_dirs () { mkdir -p $B0/b{0,1,2}/L1/L2/L3 } +function export_allow_this_host_ipv6 () { + printf "$EXPORT_ALLOW6\n" > /var/lib/glusterd/nfs/exports +} + function export_allow_this_host () { printf "$EXPORT_ALLOW\n" > ${NFSDIR}/exports } @@ -186,6 +193,11 @@ EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available ## Mount NFS EXPECT "Y" check_mount_success $V0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0 + +## Mount NFS using the IPv6 export +export_allow_this_host_ipv6 +EXPECT "Y" check_mount_success $V0 ## Disallow host TEST export_deny_this_host diff --git a/tests/configfiles/exports-v6 b/tests/configfiles/exports-v6 new file mode 100644 index 00000000000..426b1ef5705 --- /dev/null +++ b/tests/configfiles/exports-v6 @@ -0,0 +1 @@ +/test @test(rw,anonuid=0,sec=sys,) 2401:db00:11:1:face:0:3d:0(rw,anonuid=0,sec=sys,) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index ac834e90f4b..17943d7baae 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -2531,7 +2531,6 @@ unwind: return 0; } - int afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { @@ -3227,7 +3226,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4655,20 +4654,292 @@ __get_heard_from_all_status (xlator_t *this) return heard_from_all; } +static int +find_best_down_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_msg_debug (this->name, 0, "Found best down child (%d) " + "@ %ld ms latency", best_child, best_latency); + } + return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] > worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_msg_debug (this->name, 0, "Found worst up child (%d)" + " @ %ld ms latency", worst_child, worst_latency); + } + return worst_child; +} + +void +__afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, int32_t *event, + int64_t child_latency_msec) +{ + afr_private_t *priv = NULL; + int up_children = 0; + + priv = this->private; + + priv->child_latency[idx] = child_latency_msec; + gf_msg_debug (child_xlator->name, 0, "Client ping @ %ld ms", + child_latency_msec); + + up_children = __afr_get_up_children_count (priv); + + if (child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && + up_children > priv->halo_min_replicas) { + if ((up_children - 1) < + priv->halo_min_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Overriding halo threshold, " + "min replicas: %d", + priv->halo_min_replicas); + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "exceeds halo threshold (%ld), " + "marking child down.", + child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_DOWN; + } + } else if (child_latency_msec < halo_max_latency_msec && + priv->child_up[idx] == 0) { + if (up_children < priv->halo_max_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "below halo threshold (%ld), " + "marking child up.", + child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_UP; + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", idx, + priv->halo_max_replicas); + } + } +} + +void +__afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, + int32_t *event, int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int up_children = 0; + int worst_up_child = -1; + + priv = this->private; + + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + up_children = __afr_get_up_children_count (priv); + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > + halo_max_latency_msec) { + gf_msg_debug (this->name, 0, "Marking child %d down, " + "doesn't meet halo threshold (%ld), and > " + "halo_min_replicas (%d)", + worst_up_child, halo_max_latency_msec, + priv->halo_min_replicas); + priv->child_up[worst_up_child] = 0; + up_children--; + } + } + if (up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + up_children--; + gf_msg_debug (this->name, 0, "Marking child %d down, " + "up_children (%d) > halo_max_replicas (%d)", + worst_up_child, up_children, priv->halo_max_replicas); + } + + if (up_children == 1) { + gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_UP; + } + + priv->last_event[idx] = *event; +} + +void +__afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, + int idx, int64_t child_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int i = 0; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to < 0 to indicate + * the child is really disconnected. + */ + if (child_latency_msec < 0) { + priv->child_latency[idx] = child_latency_msec; + } + priv->child_up[idx] = 0; + + up_children = __afr_get_up_children_count (priv); + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. + */ + if (up_children < priv->halo_min_replicas) { + best_down_child = find_best_down_child (this); + if (best_down_child >= 0) { + gf_msg_debug (this->name, 0, + "Swapping out child %d for " + "child %d to satisfy halo_min_replicas (%d).", + idx, best_down_child, priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are down. Going " + "offline until atleast one of them " + "comes back up."); + } else { + *event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + priv->last_event[idx] = *event; +} + +static int64_t +afr_get_halo_latency (xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = + priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_msg_debug (this->name, 0, "Using halo latency %ld", + halo_max_latency_msec); + return halo_max_latency_msec; +} + + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) { afr_private_t *priv = NULL; + xlator_t *child_xlator = NULL; int i = -1; - int up_children = 0; - int down_children = 0; int propagate = 0; int had_heard_from_all = 0; int have_heard_from_all = 0; int idx = -1; int ret = -1; int call_psh = 0; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; @@ -4677,6 +4948,10 @@ afr_notify (xlator_t *this, int32_t event, struct gf_upcall_cache_invalidation *up_ci = NULL; inode_table_t *itable = NULL; inode_t *inode = NULL; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = -1; + + child_xlator = (xlator_t *)data; priv = this->private; @@ -4701,7 +4976,7 @@ afr_notify (xlator_t *this, int32_t event, * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. */ - idx = find_child_index (this, data); + idx = find_child_index (this, child_xlator); if (idx < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -4710,6 +4985,30 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency (this); + + if (event == GF_EVENT_CHILD_PING) { + /* Calculates the child latency and sets event + */ + child_latency_msec = (int64_t)(uintptr_t)data2; + LOCK (&priv->lock); + { + __afr_handle_ping_event (this, child_xlator, + idx, halo_max_latency_msec, &event, + child_latency_msec); + } + UNLOCK (&priv->lock); + } + } + + if (event == GF_EVENT_CHILD_PING) { + /* This is the only xlator that handles PING, no reason to + * propagate. + */ + goto out; + } + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { @@ -4736,57 +5035,15 @@ afr_notify (xlator_t *this, int32_t event, propagate = 1; break; case GF_EVENT_CHILD_UP: - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_children = __afr_get_up_children_count (priv); - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - gf_event (EVENT_AFR_SUBVOL_UP, - "subvol=%s", this->name); - - } else { - event = GF_EVENT_SOME_DESCENDENT_UP; - } - - priv->last_event[idx] = event; - + __afr_handle_child_up_event (this, child_xlator, + idx, halo_max_latency_msec, &event, &call_psh, + &up_child); break; case GF_EVENT_CHILD_DOWN: - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SUBVOLS_DOWN, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); - gf_event (EVENT_AFR_SUBVOLS_DOWN, - "subvol=%s", this->name); - } else { - event = GF_EVENT_SOME_DESCENDENT_DOWN; - } - - priv->last_event[idx] = event; - + __afr_handle_child_down_event (this, child_xlator, idx, + child_latency_msec, &event, &call_psh, + &up_child); break; case GF_EVENT_CHILD_CONNECTING: @@ -4839,7 +5096,6 @@ afr_notify (xlator_t *this, int32_t event, had come up, propagate CHILD_UP, but only this time */ event = GF_EVENT_CHILD_DOWN; - up_children = __afr_get_up_children_count (priv); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7f7962013d7..c7d6261b110 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -46,7 +46,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_spbc_timeout_t, gf_afr_mt_spb_status_t, gf_afr_mt_empty_brick_t, - gf_afr_mt_end + gf_afr_mt_child_latency_t, + gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c6ac5ebfd1b..4ac1d32f58a 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -58,6 +58,7 @@ typedef struct { eh_t **statistics; uint32_t max_threads; uint32_t wait_qlength; + uint32_t halo_max_latency_msec; } afr_self_heald_t; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index ceaa034dbbb..17b34822c17 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -184,6 +184,27 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); + GF_OPTION_RECONF ("halo-enabled", + priv->halo_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-shd-max-latency", + priv->shd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec, + options, uint32, out); + + GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -473,6 +494,24 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); + + GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32, + out); + + GF_OPTION_INIT ("halo-enabled", + priv->halo_enabled, bool, out); + + GF_OPTION_INIT ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, uint32, out); + + GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -528,7 +567,12 @@ init (xlator_t *this) priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up) { + + priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency), + child_count, + gf_afr_mt_child_latency_t); + + if (!priv->child_up || !priv->child_latency) { ret = -ENOMEM; goto out; } @@ -736,7 +780,50 @@ struct volume_options options[] = { "jobs that can perform parallel heals in the " "background." }, - { .key = {"heal-wait-queue-length"}, + { .key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "Maximum latency for shd halo replication in msec." + }, + { .key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable Halo (geo) replication mode." + }, + { .key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for nfsd halo replication in msec." + }, + { .key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for halo replication in msec." + }, + { .key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD." + }, + { .key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .description = "The minimmum number of halo replicas, before adding " + "out of region replicas." + }, + { .key = {"heal-wait-queue-length"}, .type = GF_OPTION_TYPE_INT, .min = 0, .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/ @@ -876,6 +963,13 @@ struct volume_options options[] = { "translator is running as part of self-heal-daemon " "or not." }, + { .key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not." + }, { .key = {"quorum-type"}, .type = GF_OPTION_TYPE_STR, .value = { "none", "auto", "fixed"}, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 0535e7c7271..3be15175dc7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -74,6 +74,11 @@ typedef enum { AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; +struct afr_nfsd { + gf_boolean_t iamnfsd; + uint32_t halo_max_latency_msec; +}; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -85,6 +90,7 @@ typedef struct _afr_private { inode_t *root_inode; unsigned char *child_up; + int64_t *child_latency; unsigned char *local; char **pending_key; @@ -155,8 +161,14 @@ typedef struct _afr_private { gf_boolean_t ensure_durability; char *sh_domain; char *afr_dirty; + gf_boolean_t halo_enabled; + + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; - afr_self_heald_t shd; + afr_self_heald_t shd; + struct afr_nfsd nfsd; gf_boolean_t consistent_metadata; uint64_t spb_choice_timeout; diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c index b9339a770d1..c1139423d6d 100644 --- a/xlators/features/changelog/lib/src/gf-changelog-rpc.c +++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c @@ -26,6 +26,7 @@ gf_changelog_rpc_notify (struct rpc_clnt *rpc, case RPC_CLNT_DISCONNECT: case RPC_CLNT_MSG: case RPC_CLNT_DESTROY: + case RPC_CLNT_PING: break; } diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c index cada369ba0f..caa5bbbadcd 100644 --- a/xlators/features/changelog/src/changelog-ev-handle.c +++ b/xlators/features/changelog/src/changelog-ev-handle.c @@ -179,6 +179,8 @@ changelog_rpc_notify (struct rpc_clnt *rpc, /* Free up mydata */ changelog_rpc_clnt_unref (crpc); break; + case RPC_CLNT_PING: + break; } return 0; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 77b7d55fa29..f0713e6e64a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -5038,6 +5038,24 @@ out: return ret; } +static int +volgen_graph_set_iam_nfsd (const volgen_graph_t *graph) +{ + xlator_t *trav; + int ret = 0; + + for (trav = first_of ((volgen_graph_t *)graph); trav; + trav = trav->next) { + if (strcmp (trav->type, "cluster/replicate") != 0) + continue; + + ret = xlator_set_option (trav, "iam-nfs-daemon", "yes"); + if (ret) + break; + } + return ret; +} + /* builds a graph for nfs server role, with option overrides in mod_dict */ int build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) @@ -5176,6 +5194,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) if (ret) goto out; + ret = volgen_graph_set_iam_nfsd (&cgraph); + if (ret) + goto out; + ret = volgen_graph_merge_sub (graph, &cgraph, 1); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 5d164fd8df3..2dd5a92f310 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -287,6 +287,11 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) int32_t type = 0; char *username = NULL; char *password = NULL; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif GF_ASSERT (req); @@ -395,11 +400,12 @@ __glusterd_handle_create_volume (rpcsvc_request_t *req) if (conf->op_version >= GD_OP_VERSION_3_8_0) { ret = dict_set_dynstr_with_alloc (dict, "transport.address-family", - "inet"); + addr_family); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to set " - "transport.address-family"); + "transport.address-family " + "to %s", addr_family); goto out; } } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 728da74b7a6..8b2ac810e09 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3323,6 +3323,39 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_10_1, .flags = OPT_FLAG_CLIENT_OPT }, + + /* Halo replication options */ + { .key = "cluster.halo-enabled", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-shd-max-latency", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-nfsd-max-latency", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-latency", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-max-replicas", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.halo-min-replicas", + .voltype = "cluster/replicate", + .op_version = GD_OP_VERSION_3_11_0, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = NULL } }; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 6c4cdfed062..b5c90ba1dff 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -186,6 +186,21 @@ start_glusterfs () fi #options with values start here + if [ -n "$halo_max_latency" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-latency=$halo_max_latency"); + fi + + if [ -n "$halo_max_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-max-replicas=$halo_max_replicas"); + fi + + if [ -n "$halo_min_replicas" ]; then + cmd_line=$(echo "$cmd_line --xlator-option \ + *replicate*.halo-min-replicas=$halo_min_replicas"); + fi + if [ -n "$log_level" ]; then cmd_line=$(echo "$cmd_line --log-level=$log_level"); fi @@ -479,6 +494,15 @@ with_options() [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts," fuse_mountopts="${fuse_mountopts}$key=\"$value\"" ;; + "halo-max-latency") + halo_max_latency=$value + ;; + "halo-max-replicas") + halo_max_replicas=$value + ;; + "halo-min-replicas") + halo_min_replicas=$value + ;; x-*) # comments or userspace application-specific options, drop them ;; diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h index bc9af2f0b8b..0079b9a3deb 100644 --- a/xlators/nfs/server/src/exports.h +++ b/xlators/nfs/server/src/exports.h @@ -22,7 +22,7 @@ #define GF_EXP GF_NFS"-exports" #define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())" -#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/)-]+)" +#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)" #define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)" #define NETGROUP_MAX_LEN 128 diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c index 3f6415dba85..2d022263aca 100644 --- a/xlators/nfs/server/src/mount3.c +++ b/xlators/nfs/server/src/mount3.c @@ -1907,7 +1907,7 @@ _mnt3_get_host_from_peer (const char *peer_addr) size_t host_len = 0; char *colon = NULL; - colon = strchr (peer_addr, ':'); + colon = strrchr (peer_addr, ':'); if (!colon) { gf_msg (GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER, "Bad peer %s", peer_addr); @@ -4152,6 +4152,15 @@ mnt1svc_init (xlator_t *nfsx) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (options, "transport.address-family", "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, + "dict_set_str error when trying to enable ipv6"); + goto err; + } +#endif + ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); if (ret == -1) { gf_msg (GF_NFS, GF_LOG_ERROR, errno, diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c index 3e7199c036d..1973b95c5a5 100644 --- a/xlators/nfs/server/src/nfs.c +++ b/xlators/nfs/server/src/nfs.c @@ -204,6 +204,9 @@ nfs_program_register_portmap_all (struct nfs_state *nfs) if (nfs->override_portnum) prog->progport = nfs->override_portnum; (void) rpcsvc_program_register_portmap (prog, prog->progport); +#ifdef IPV6_DEFAULT + (void) rpcsvc_program_register_rpcbind6 (prog, prog->progport); +#endif } return (0); @@ -339,6 +342,17 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this) if (version->required) goto err; } +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_register_rpcbind6 (prog, + prog->progport); + if (ret == -1) { + gf_msg (GF_NFS, GF_LOG_ERROR, 0, + NFS_MSG_PGM_REG_FAIL, + "Program (ipv6) %s registration failed", + prog->progname); + goto err; + } +#endif } } @@ -901,6 +915,16 @@ nfs_init_state (xlator_t *this) } } +#ifdef IPV6_DEFAULT + ret = dict_set_str (this->options, "transport.address-family", + "inet6"); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "dict_set_str error"); + goto free_foppool; + } +#endif + + /* Right only socket support exists between nfs client and * gluster nfs, so we can set default value as socket */ diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index 447366c0deb..a36f6e435a7 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -1553,7 +1553,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi rpc_clnt_reconfig (conf->rpc, &config); conf->skip_notify = 1; - conf->quick_reconnect = 1; + conf->quick_reconnect = 1; out: if (frame) diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index e8db8eed166..6cb5b6b38ca 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -464,7 +464,7 @@ int32_t client_forget (xlator_t *this, inode_t *inode) { /* Nothing here */ - return 0; + return 0; } int32_t @@ -542,7 +542,7 @@ out: STACK_UNWIND_STRICT (lookup, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -568,7 +568,7 @@ out: if (ret) STACK_UNWIND_STRICT (stat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -597,7 +597,7 @@ out: STACK_UNWIND_STRICT (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -625,7 +625,7 @@ out: if (ret) STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -654,7 +654,7 @@ out: if (ret) STACK_UNWIND_STRICT (access, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -684,7 +684,7 @@ out: if (ret) STACK_UNWIND_STRICT (readlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -715,7 +715,7 @@ out: STACK_UNWIND_STRICT (mknod, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -745,7 +745,7 @@ out: STACK_UNWIND_STRICT (mkdir, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -775,7 +775,7 @@ out: STACK_UNWIND_STRICT (unlink, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -804,7 +804,7 @@ out: STACK_UNWIND_STRICT (rmdir, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -834,7 +834,7 @@ out: STACK_UNWIND_STRICT (symlink, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -864,7 +864,7 @@ out: STACK_UNWIND_STRICT (rename, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -894,7 +894,7 @@ out: STACK_UNWIND_STRICT (link, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -929,7 +929,7 @@ out: STACK_UNWIND_STRICT (create, frame, -1, ENOTCONN, NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } @@ -962,7 +962,7 @@ out: if (ret) STACK_UNWIND_STRICT (open, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -997,7 +997,7 @@ out: STACK_UNWIND_STRICT (readv, frame, -1, ENOTCONN, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; } @@ -1035,7 +1035,7 @@ out: if (ret) STACK_UNWIND_STRICT (writev, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1061,7 +1061,7 @@ out: if (ret) STACK_UNWIND_STRICT (flush, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1090,7 +1090,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsync, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } @@ -1117,7 +1117,7 @@ out: if (ret) STACK_UNWIND_STRICT (fstat, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1146,7 +1146,7 @@ out: if (ret) STACK_UNWIND_STRICT (opendir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1174,7 +1174,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1201,7 +1201,7 @@ out: if (ret) STACK_UNWIND_STRICT (statfs, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } static gf_boolean_t @@ -1390,7 +1390,7 @@ out: if (need_unwind) STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); - return 0; + return 0; } @@ -1420,7 +1420,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1450,7 +1450,7 @@ out: if (ret) STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1479,7 +1479,7 @@ out: if (ret) STACK_UNWIND_STRICT (getxattr, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1509,7 +1509,7 @@ out: if (ret) STACK_UNWIND_STRICT (xattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1539,7 +1539,7 @@ out: if (ret) STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1568,7 +1568,7 @@ out: if (ret) STACK_UNWIND_STRICT (removexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1595,7 +1595,7 @@ out: if (ret) STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } int32_t @@ -1651,7 +1651,7 @@ out: if (ret) STACK_UNWIND_STRICT (lk, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1681,7 +1681,7 @@ out: if (ret) STACK_UNWIND_STRICT (inodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1712,7 +1712,7 @@ out: if (ret) STACK_UNWIND_STRICT (finodelk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1744,7 +1744,7 @@ out: if (ret) STACK_UNWIND_STRICT (entrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1777,7 +1777,7 @@ out: if (ret) STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOTCONN, NULL); - return 0; + return 0; } @@ -1806,7 +1806,7 @@ out: if (ret) STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOTCONN, 0, NULL, NULL); - return 0; + return 0; } int32_t @@ -1837,7 +1837,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdir, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1869,7 +1869,7 @@ out: if (ret) STACK_UNWIND_STRICT (readdirp, frame, -1, ENOTCONN, NULL, NULL); - return 0; + return 0; } @@ -1898,7 +1898,7 @@ out: if (ret) STACK_UNWIND_STRICT (setattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -1926,7 +1926,7 @@ out: if (ret) STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOTCONN, NULL, NULL, NULL); - return 0; + return 0; } int32_t @@ -2152,7 +2152,7 @@ out: if (ret) STACK_UNWIND_STRICT (getspec, frame, -1, EINVAL, NULL); - return 0; + return 0; } @@ -2223,6 +2223,15 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf = this->private; switch (event) { + case RPC_CLNT_PING: + { + ret = default_notify (this, GF_EVENT_CHILD_PING, data); + if (ret) + gf_log (this->name, GF_LOG_INFO, + "CHILD_PING notify failed"); + conf->last_sent_event = GF_EVENT_CHILD_PING; + break; + } case RPC_CLNT_CONNECT: { conf->connected = 1; @@ -2314,7 +2323,6 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, } else { rpc->conn.config.remote_port = 0; - } break; @@ -2666,7 +2674,7 @@ reconfigure (xlator_t *this, dict_t *options) ret = 0; out: - return ret; + return ret; } |