summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Kevin Vigor <kvigor@fb.com> 2016-06-27 10:39:01 -0700
committer: Shreyas Siravara <sshreyas@fb.com> 2016-12-16 21:46:54 -0800
commit: 2c6abb9d9781fa5dda216e78d78d0b9a9ab17d35 (patch)
tree: 3255c414bc2da0ba418abb090cf402b0f1cf7fa0
parent: 81b671693c0015dd9f7acf818488118dbd61a6b0 (diff)
protocol/client: Fix race in brick reconnection
Summary:
- A race condition exists when reconnecting to a brick after the connection has been lost: it is possible for the client translator to believe the connection is down while the socket layer believes the connection is up. This situation is permanent and eventually leads to loss of quorum and EROFS errors.
- This is a cherry-pick of D3490020 to 3.8.

Signed-off-by: Shreyas Siravara <sshreyas@fb.com>
Change-Id: Ida7afbafd3dceadf9ca7ea8b350aa88db382dd88
Reviewed-on: http://review.gluster.org/16174
Reviewed-by: Kevin Vigor <kvigor@fb.com>
Tested-by: Shreyas Siravara <sshreyas@fb.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r-- xlators/protocol/client/src/client.c | 28
1 files changed, 23 insertions, 5 deletions
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 0647f1035ba..55c12e8e7ca 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -2321,12 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf->connected = 0;
conf->skip_notify = 0;
- if (conf->quick_reconnect) {
- conf->quick_reconnect = 0;
- rpc_clnt_start (rpc);
-
- } else {
+ if (conf->rpc->conn.connected) {
+ /* Having conf->connected false and
+ * conf->rpc->conn.connected true is an
+ * unrecoverable state, since rpc_clnt_reconnect
+ * will do nothing for an already connected connection.
+ * A good fix would be to ensure serialized
+ * delivery of transport messages, but that is super hard
+ * and this is rare. So... ghetto "fix", disconnect the
+ * RPC and start the race again. Maybe we'll win
+ * next time!
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "Client %s reconnect race detected, "
+ "restarting.", conf->rpc->conn.name);
+ conf->quick_reconnect = 1;
+ rpc_transport_disconnect (rpc->conn.trans);
rpc->conn.config.remote_port = 0;
+ } else {
+ if (conf->quick_reconnect) {
+ conf->quick_reconnect = 0;
+ rpc_clnt_start (rpc);
+ } else {
+ rpc->conn.config.remote_port = 0;
+ }
}
break;