From 2c6abb9d9781fa5dda216e78d78d0b9a9ab17d35 Mon Sep 17 00:00:00 2001
From: Kevin Vigor
Date: Mon, 27 Jun 2016 10:39:01 -0700
Subject: protocol/client: Fix race in brick reconnection

Summary:
- A race condition exists when reconnecting to a brick after connection has
  been lost; it is possible for the client translator to believe the
  connection is down while the socket layer believes the connection is up.
  This situation is permanent and eventually leads to loss of quorum and
  EROFS errors.
- This is a cherry-pick of D3490020 to 3.8

Signed-off-by: Shreyas Siravara
Change-Id: Ida7afbafd3dceadf9ca7ea8b350aa88db382dd88
Reviewed-on: http://review.gluster.org/16174
Reviewed-by: Kevin Vigor
Tested-by: Shreyas Siravara
NetBSD-regression: NetBSD Build System
Smoke: Gluster Build System
CentOS-regression: Gluster Build System
---
 xlators/protocol/client/src/client.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 0647f1035ba..55c12e8e7ca 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -2321,12 +2321,30 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
                 conf->connected = 0;
                 conf->skip_notify = 0;
 
-                if (conf->quick_reconnect) {
-                        conf->quick_reconnect = 0;
-                        rpc_clnt_start (rpc);
-
-                } else {
+                if (conf->rpc->conn.connected) {
+                        /* Having conf->connected false and
+                         * conf->rpc->conn.connected true is an
+                         * unrecoverable state, since rpc_clnt_reconnect
+                         * will do nothing for an already connected connection.
+                         * A good fix would be to ensure serialized
+                         * delivery of transport messages, but that is super hard
+                         * and this is rare. So... ghetto "fix", disconnect the
+                         * RPC and start the race again. Maybe we'll win
+                         * next time!
+                         */
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Client %s reconnect race detected, "
+                                "restarting.", conf->rpc->conn.name);
+                        conf->quick_reconnect = 1;
+                        rpc_transport_disconnect (rpc->conn.trans);
                         rpc->conn.config.remote_port = 0;
+                } else {
+                        if (conf->quick_reconnect) {
+                                conf->quick_reconnect = 0;
+                                rpc_clnt_start (rpc);
+                        } else {
+                                rpc->conn.config.remote_port = 0;
+                        }
                 }
 
                 break;
--
cgit