diff options
author | Kevin Vigor <kvigor@fb.com> | 2017-03-21 08:23:25 -0700 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2017-05-02 10:23:53 +0000 |
commit | 07cc8679cdf3b29680f4f105d0222da168d8bfc1 (patch) | |
tree | da838a4f8afc045253300604e2536f97c533e41d /rpc | |
parent | 9374338f9c2f126c6608625f750d5ea1f7ef6a06 (diff) |
Halo Replication feature for AFR translator
Summary:
Halo Geo-replication is a feature which allows Gluster or NFS clients to write
locally to their region (as defined by a latency "halo" or threshold if you
like), and have their writes asynchronously propagate from their origin to the
rest of the cluster. Clients can also write synchronously to the cluster
simply by specifying a halo-latency which is very large (e.g. 10seconds) which
will include all bricks.
In other words, it allows clients to decide at mount time if they desire
synchronous or asynchronous IO into a cluster and the cluster can support both
of these modes to any number of clients simultaneously.
There are a few new volume options due to this feature:
halo-shd-latency: The threshold below which self-heal daemons will
consider children (bricks) connected.
halo-nfsd-latency: The threshold below which NFS daemons will consider
children (bricks) connected.
halo-latency: The threshold below which all other clients will
consider children (bricks) connected.
halo-min-replicas: The minimum number of replicas which are to
be enforced regardless of latency specified in the above 3 options.
If the number of children falls below this threshold the next
best (chosen by latency) shall be swapped in.
New FUSE mount options:
halo-latency & halo-min-replicas: As descripted above.
This feature combined with multi-threaded SHD support (D1271745) results in
some pretty cool geo-replication possibilities.
Operational Notes:
- Global consistency is gaurenteed for synchronous clients, this is provided by
the existing entry-locking mechanism.
- Asynchronous clients on the other hand and merely consistent to their region.
Writes & deletes will be protected via entry-locks as usual preventing
concurrent writes into files which are undergoing replication. Read operations
on the other hand should never block.
- Writes are allowed from _any_ region and propagated from the origin to all
other regions. The take away from this is care should be taken to ensure
multiple writers do not write the same files resulting in a gfid split-brain
which will require resolution via split-brain policies (majority, mtime &
size). Recommended method for preventing this is using the nfs-auth feature to
define which region for each share has RW permissions, tiers not in the origin
region should have RO perms.
TODO:
- Synchronous clients (including the SHD) should choose clients from their own
region as preferred sources for reads. Most of the plumbing is in place for
this via the child_latency array.
- Better GFID split brain handling & better dent type split brain handling
(i.e. create a trash can and move the offending files into it).
- Tagging in addition to latency as a means of defining which children you wish
to synchronously write to
Test Plan:
- The usual suspects, clang, gcc w/ address sanitizer & valgrind
- Prove tests
Reviewers: jackl, dph, cjh, meyering
Reviewed By: meyering
Subscribers: ethanr
Differential Revision: https://phabricator.fb.com/D1272053
Tasks: 4117827
Change-Id: I694a9ab429722da538da171ec528406e77b5e6d1
BUG: 1428061
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16099
Reviewed-on: https://review.gluster.org/16177
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'rpc')
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt-ping.c | 65 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.h | 1 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpc-transport.c | 12 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpcsvc.c | 89 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpcsvc.h | 5 | ||||
-rw-r--r-- | rpc/rpc-transport/socket/src/name.c | 13 | ||||
-rw-r--r-- | rpc/rpc-transport/socket/src/socket.c | 15 | ||||
-rw-r--r-- | rpc/xdr/src/glusterfs-fops.x | 1 |
8 files changed, 187 insertions, 14 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index e042121ad47..5a97f4bb9cf 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -18,6 +18,7 @@ #include "mem-pool.h" #include "xdr-rpc.h" #include "rpc-common-xdr.h" +#include "timespec.h" char *clnt_ping_procs[GF_DUMP_MAXVALUE] = { @@ -30,6 +31,11 @@ struct rpc_clnt_program clnt_ping_prog = { .procnames = clnt_ping_procs, }; +struct ping_local { + struct rpc_clnt *rpc; + struct timespec submit_time; +}; + /* Must be called under conn->lock */ static int __rpc_clnt_rearm_ping_timer (struct rpc_clnt *rpc, gf_timer_cbk_t cbk) @@ -170,11 +176,17 @@ int rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { - struct rpc_clnt *rpc = NULL; + struct ping_local *local = NULL; xlator_t *this = NULL; rpc_clnt_connection_t *conn = NULL; call_frame_t *frame = NULL; int unref = 0; + gf_boolean_t call_notify = _gf_false; + + struct timespec now; + struct timespec delta; + int64_t latency_msec = 0; + int ret = 0; if (!myframe) { gf_log (THIS->name, GF_LOG_WARNING, @@ -184,14 +196,23 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, frame = myframe; this = frame->this; - rpc = frame->local; - frame->local = NULL; /* Prevent STACK_DESTROY from segfaulting */ - conn = &rpc->conn; + local = frame->local; + conn = &local->rpc->conn; + + timespec_now (&now); + timespec_sub (&local->submit_time, &now, &delta); + latency_msec = delta.tv_sec * 1000 + delta.tv_nsec / 1000000; pthread_mutex_lock (&conn->lock); { + this->client_latency = latency_msec; + gf_log (THIS->name, GF_LOG_DEBUG, + "Ping latency is %" PRIu64 "ms", + latency_msec); + + call_notify = _gf_true; if (req->rpc_status == -1) { - unref = rpc_clnt_remove_ping_timer_locked (rpc); + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (unref) { gf_log (this->name, GF_LOG_WARNING, "socket or ib related error"); @@ -206,8 +227,8 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, goto unlock; } - unref = rpc_clnt_remove_ping_timer_locked (rpc); - if (__rpc_clnt_rearm_ping_timer (rpc, + unref = rpc_clnt_remove_ping_timer_locked (local->rpc); + if (__rpc_clnt_rearm_ping_timer (local->rpc, rpc_clnt_start_ping) == -1) { gf_log (this->name, GF_LOG_WARNING, "failed to set the ping timer"); @@ -216,12 +237,24 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, } unlock: pthread_mutex_unlock (&conn->lock); + + if (call_notify) { + ret = local->rpc->notifyfn (local->rpc, this, RPC_CLNT_PING, + (void *)(uintptr_t)latency_msec); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "RPC_CLNT_PING notify failed"); + } + } out: if (unref) - rpc_clnt_unref (rpc); + rpc_clnt_unref (local->rpc); - if (frame) + if (frame) { + GF_FREE (frame->local); + frame->local = NULL; STACK_DESTROY (frame->root); + } return 0; } @@ -231,18 +264,28 @@ rpc_clnt_ping (struct rpc_clnt *rpc) call_frame_t *frame = NULL; int32_t ret = -1; rpc_clnt_connection_t *conn = NULL; + struct ping_local *local = NULL; conn = &rpc->conn; + local = GF_CALLOC (1, sizeof(struct ping_local), + gf_common_ping_local_t); + if (!local) + return ret; frame = create_frame (THIS, THIS->ctx->pool); - if (!frame) + if (!frame) { + GF_FREE (local); return ret; + } - frame->local = rpc; + local->rpc = rpc; + timespec_now (&local->submit_time); + frame->local = local; ret = rpc_clnt_submit (rpc, &clnt_ping_prog, GF_DUMP_PING, rpc_clnt_ping_cbk, NULL, 0, NULL, 0, NULL, frame, NULL, 0, NULL, 0, NULL); if (ret) { + /* FIXME: should we free the frame here? Methinks so! */ gf_log (THIS->name, GF_LOG_ERROR, "failed to start ping timer"); } diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index b731ba2dfad..e5e493cd79e 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -19,6 +19,7 @@ typedef enum { RPC_CLNT_CONNECT, RPC_CLNT_DISCONNECT, + RPC_CLNT_PING, RPC_CLNT_MSG, RPC_CLNT_DESTROY } rpc_clnt_event_t; diff --git a/rpc/rpc-lib/src/rpc-transport.c b/rpc/rpc-lib/src/rpc-transport.c index 4fbbc8cd7fc..4c3d5279fe1 100644 --- a/rpc/rpc-lib/src/rpc-transport.c +++ b/rpc/rpc-lib/src/rpc-transport.c @@ -656,6 +656,11 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, dict_t *dict = NULL; char *host = NULL; int ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; +#else + char *addr_family = "inet"; +#endif GF_ASSERT (options); GF_ASSERT (hostname); @@ -686,6 +691,13 @@ rpc_transport_inet_options_build (dict_t **options, const char *hostname, goto out; } + ret = dict_set_str (dict, "address-family", addr_family); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set address-family to %s", addr_family); + goto out; + } + ret = dict_set_str (dict, "transport-type", "socket"); if (ret) { gf_log (THIS->name, GF_LOG_WARNING, diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c index 2be54a382f9..4cd2be17085 100644 --- a/rpc/rpc-lib/src/rpcsvc.c +++ b/rpc/rpc-lib/src/rpcsvc.c @@ -37,6 +37,10 @@ #include <stdarg.h> #include <stdio.h> +#ifdef IPV6_DEFAULT +#include <netconfig.h> +#endif + #include "xdr-rpcclnt.h" #include "glusterfs-acl.h" @@ -1386,6 +1390,82 @@ rpcsvc_error_reply (rpcsvc_request_t *req) return rpcsvc_submit_generic (req, &dummyvec, 0, NULL, 0, NULL); } +#ifdef IPV6_DEFAULT +int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port) +{ + const int IP_BUF_LEN = 64; + char addr_buf[IP_BUF_LEN]; + + int err = 0; + bool_t success = 0; + struct netconfig *nc; + struct netbuf *nb; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + + err = sprintf (addr_buf, "::.%d.%d", port >> 8 & 0xff, + port & 0xff); + if (err < 0) { + err = -1; + goto out; + } + + nb = uaddr2taddr (nc, addr_buf); + if (!nb) { + err = -1; + goto out; + } + + success = rpcb_set (newprog->prognum, newprog->progver, nc, nb); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not register the IPv6" + " service with rpcbind"); + } + + err = 0; + +out: + return err; +} + +int +rpcsvc_program_unregister_rpcbind6 (rpcsvc_program_t *newprog) +{ + int err = 0; + bool_t success = 0; + struct netconfig *nc; + + if (!newprog) { + goto out; + } + + nc = getnetconfigent ("tcp6"); + if (!nc) { + err = -1; + goto out; + } + + success = rpcb_unset (newprog->prognum, newprog->progver, nc); + if (!success) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "Could not unregister the IPv6" + " service with rpcbind"); + } + + err = 0; +out: + return err; +} +#endif /* Register the program with the local portmapper service. */ int @@ -1550,7 +1630,14 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) " program failed"); goto out; } - +#ifdef IPV6_DEFAULT + ret = rpcsvc_program_unregister_rpcbind6 (program); + if (ret == -1) { + gf_log (GF_RPCSVC, GF_LOG_ERROR, "rpcbind (ipv6)" + " unregistration of program failed"); + goto out; + } +#endif pthread_mutex_lock (&svc->rpclock); { list_for_each_entry (prog, &svc->programs, program) { diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h index cf3e5906de1..8097a525dab 100644 --- a/rpc/rpc-lib/src/rpcsvc.h +++ b/rpc/rpc-lib/src/rpcsvc.h @@ -434,6 +434,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener); extern int rpcsvc_program_register_portmap (rpcsvc_program_t *newprog, uint32_t port); +#ifdef IPV6_DEFAULT +extern int +rpcsvc_program_register_rpcbind6 (rpcsvc_program_t *newprog, uint32_t port); +#endif + extern int rpcsvc_program_unregister_portmap (rpcsvc_program_t *newprog); diff --git a/rpc/rpc-transport/socket/src/name.c b/rpc/rpc-transport/socket/src/name.c index e9de1a78813..56e9bb8d31b 100644 --- a/rpc/rpc-transport/socket/src/name.c +++ b/rpc/rpc-transport/socket/src/name.c @@ -562,6 +562,14 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) data_t *address_family_data = NULL; int32_t ret = -1; +#ifdef IPV6_DEFAULT + char *addr_family = "inet6"; + sa_family_t default_family = AF_INET6; +#else + char *addr_family = "inet"; + sa_family_t default_family = AF_INET; +#endif + GF_VALIDATE_OR_GOTO ("socket", sa_family, out); address_family_data = dict_get (this->options, @@ -586,8 +594,9 @@ server_fill_address_family (rpc_transport_t *this, sa_family_t *sa_family) } } else { gf_log (this->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting to inet"); - *sa_family = AF_INET; + "option address-family not specified, " + "defaulting to %s", addr_family); + *sa_family = default_family; } ret = 0; diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c index e6e73bf8f22..051022290fa 100644 --- a/rpc/rpc-transport/socket/src/socket.c +++ b/rpc/rpc-transport/socket/src/socket.c @@ -3101,6 +3101,21 @@ socket_connect (rpc_transport_t *this, int port) } } + /* Make sure we are not vulnerable to someone setting + * net.ipv6.bindv6only to 1 so that gluster services are + * avalable over IPv4 & IPv6. + */ + int disable_v6only = 0; + + if (setsockopt (priv->sock, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&disable_v6only, + sizeof (disable_v6only)) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Error disabling sockopt IPV6_V6ONLY: \"%s\"", + strerror (errno)); + } + + if (priv->nodelay && (sa_family != AF_UNIX)) { ret = __socket_nodelay (priv->sock); diff --git a/rpc/xdr/src/glusterfs-fops.x b/rpc/xdr/src/glusterfs-fops.x index 7b0bcb33213..5b7fe001f1c 100644 --- a/rpc/xdr/src/glusterfs-fops.x +++ b/rpc/xdr/src/glusterfs-fops.x @@ -103,6 +103,7 @@ enum glusterfs_event_t { GF_EVENT_SOME_DESCENDENT_DOWN, GF_EVENT_SCRUB_ONDEMAND, GF_EVENT_SOME_DESCENDENT_UP, + GF_EVENT_CHILD_PING, GF_EVENT_MAXVAL }; |