diff options
author | Xavier Hernandez <jahernan@redhat.com> | 2018-01-19 12:18:13 +0100 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2019-05-11 14:25:53 +0000 |
commit | 59841f7e1ff0511b04884015441a181a56d07bea (patch) | |
tree | 7b4f16752014cf0cfc0ba1aad1847a43325e28a9 /rpc/rpc-lib/src | |
parent | da4601d536da761ce908a2461a0930857f99f171 (diff) |
rpc: implement reconnect back-off strategy
When a connection failure happens, gluster tries to reconnect every 3
seconds. In some cases the failure is spurious, so a delay of 3 seconds
could be unnecessarily long.
This patch implements a back-off strategy in which the first reconnect is
attempted after one tenth of a second. If it fails, the delay is doubled
after each failed attempt until it is around 3 seconds. After that, the
reconnect is attempted every 3 seconds as before.
Change-Id: Icb3fbe20d618f50cbbb599dce542b4e871c22149
Updates: bz#1193929
Signed-off-by: Xavier Hernandez <xhernandez@redhat.com>
Diffstat (limited to 'rpc/rpc-lib/src')
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.c | 33 | ||||
-rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.h | 1 |
2 files changed, 18 insertions, 16 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index 8ef05378351..c1945dfb6ec 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -392,8 +392,16 @@ rpc_clnt_reconnect(void *conn_ptr) conn->reconnect = 0; if ((conn->connected == 0) && !clnt->disabled) { - ts.tv_sec = 3; - ts.tv_nsec = 0; + if (conn->reconnect_delay.tv_sec < 3) { + conn->reconnect_delay.tv_sec *= 2; + int64_t ns = conn->reconnect_delay.tv_nsec * 2; + if (ns >= 1000000000ULL) { + conn->reconnect_delay.tv_sec++; + ns -= 1000000000ULL; + } + conn->reconnect_delay.tv_nsec = ns; + } + ts = conn->reconnect_delay; gf_log(conn->name, GF_LOG_TRACE, "attempting reconnect"); (void)rpc_transport_connect(trans, conn->config.remote_port); @@ -838,9 +846,11 @@ rpc_clnt_handle_disconnect(struct rpc_clnt *clnt, rpc_clnt_connection_t *conn) pthread_mutex_lock(&conn->lock); { + conn->reconnect_delay.tv_sec = 0; + conn->reconnect_delay.tv_nsec = 100000000; + if (!conn->rpc_clnt->disabled && (conn->reconnect == NULL)) { - ts.tv_sec = 3; - ts.tv_nsec = 0; + ts = conn->reconnect_delay; rpc_clnt_ref(clnt); conn->reconnect = gf_timer_call_after(clnt->ctx, ts, @@ -1160,6 +1170,8 @@ rpc_clnt_start(struct rpc_clnt *rpc) * rpc_clnt_reconnect fire event. */ rpc_clnt_ref(rpc); + conn->reconnect_delay.tv_sec = 0; + conn->reconnect_delay.tv_nsec = 50000000; rpc_clnt_reconnect(conn); return 0; @@ -1177,18 +1189,7 @@ rpc_clnt_cleanup_and_start(struct rpc_clnt *rpc) rpc_clnt_connection_cleanup(conn); - pthread_mutex_lock(&conn->lock); - { - rpc->disabled = 0; - } - pthread_mutex_unlock(&conn->lock); - /* Corresponding unref will be either on successful timer cancel or last - * rpc_clnt_reconnect fire event. 
- */ - rpc_clnt_ref(rpc); - rpc_clnt_reconnect(conn); - - return 0; + return rpc_clnt_start(rpc); } int diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index b46feed50c8..2c252d5ff86 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -136,6 +136,7 @@ struct rpc_clnt_connection { struct saved_frames *saved_frames; struct timespec last_sent; struct timespec last_received; + struct timespec reconnect_delay; uint64_t pingcnt; uint64_t msgcnt; uint64_t cleanup_gen; |