author     Vijaikumar M <vmallika@redhat.com>    2014-05-23 14:42:08 +0530
committer  Raghavendra G <rgowdapp@redhat.com>   2014-06-05 10:11:45 -0700
commit     42b956971c47fd0708cbbd17ce8c78c2ed79bfba (patch)
tree       c8056e730ff7277a6a323aad85ac8874862e4099
parent     a89e35727e3a9a7226c7a16479935b7109b11663 (diff)
glusterd: Handle rpc_connect failure in the event handler
Currently, rpc_connect calls the notification function in the same
thread when the connection fails. The glusterd notification handler
acquires the big_lock, so callers must release the big_lock before
invoking rpc_connect.
In snapshot creation, releasing the big_lock before the operation
completes can cause problems such as deadlock or memory corruption.
Bricks are started as part of the snapshot create operation, and
brick_start releases the big_lock when doing brick_connect, which
might crash glusterd.
There is a similar issue in bug# 1088355.
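To make the hazard concrete, here is a minimal, self-contained C sketch (not glusterd code; big_lock, notify_disconnect and rpc_connect_sketch are illustrative names) of a connect routine that reports failure through a callback in the caller's own thread, while that thread still holds the lock the callback needs:

```c
/* Illustrative only: models the pre-patch behaviour where rpc_connect
 * invokes the notify callback synchronously in the caller's thread. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t big_lock;

/* stand-in for glusterd's notify path, which needs big_lock */
static void
notify_disconnect (void)
{
        int err = pthread_mutex_lock (&big_lock);
        if (err != 0) {
                /* with a default mutex this lock would simply hang */
                printf ("self-deadlock detected: %s\n", strerror (err));
                return;
        }
        printf ("DISCONNECT handled\n");
        pthread_mutex_unlock (&big_lock);
}

/* pre-patch pattern: failure notified synchronously, same thread */
static int
rpc_connect_sketch (void)
{
        notify_disconnect ();
        return -1;
}

int
main (void)
{
        pthread_mutexattr_t attr;

        /* error-checking mutex so the re-lock reports EDEADLK
           instead of hanging the demo */
        pthread_mutexattr_init (&attr);
        pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init (&big_lock, &attr);

        pthread_mutex_lock (&big_lock);   /* caller holds big_lock */
        rpc_connect_sketch ();            /* callback re-locks: EDEADLK */
        pthread_mutex_unlock (&big_lock);
        return 0;
}
```

With a default mutex the second lock attempt would simply hang; the error-checking mutex is used here only so the demo terminates and reports the self-deadlock.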
The solution is to let the event handler handle the failure rather
than doing it in rpc_connect.
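A rough sketch of that approach, assuming only POSIX threads; the names below are hypothetical stand-ins for the socket_connect_error_state_ / socket_connect_error_cbk machinery the patch adds to socket.c:

```c
/* Illustrative only: defer the failure notification to a short-lived
 * thread so the caller can keep holding big_lock. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

struct error_state {
        const char *peer;       /* stand-in for the transport object */
};

/* runs in its own thread, so taking big_lock cannot self-deadlock */
static void *
connect_error_cbk (void *opaque)
{
        struct error_state *arg = opaque;

        pthread_mutex_lock (&big_lock);  /* blocks until caller unlocks */
        printf ("DISCONNECT handled for %s\n", arg->peer);
        pthread_mutex_unlock (&big_lock);

        free (arg);
        return NULL;
}

/* post-patch pattern: hand the notification off to another thread */
static int
rpc_connect_sketch (const char *peer)
{
        struct error_state *arg = calloc (1, sizeof (*arg));
        pthread_t           tid;

        if (!arg)
                return -1;
        arg->peer = peer;
        if (pthread_create (&tid, NULL, connect_error_cbk, arg) != 0) {
                free (arg);
                return -1;
        }
        pthread_detach (tid);
        return -1;              /* the connect itself still failed */
}

int
main (void)
{
        pthread_mutex_lock (&big_lock);     /* caller holds big_lock */
        rpc_connect_sketch ("peer:24007");  /* no self-deadlock now */
        pthread_mutex_unlock (&big_lock);   /* callback can proceed */
        sleep (1);                          /* let the thread finish */
        return 0;
}
```

Because the notification runs in its own thread, it merely blocks until the caller releases big_lock instead of self-deadlocking. The real callback additionally restores THIS, drops the transport reference it was handed, and frees its argument; the sketch only keeps the threading shape.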
Change-Id: I088d44092ce845a07516c1d67abd02b220e08b38
BUG: 1101507
Signed-off-by: Vijaikumar M <vmallika@redhat.com>
Reviewed-on: http://review.gluster.org/7843
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Tested-by: Raghavendra G <rgowdapp@redhat.com>
-rw-r--r--  rpc/rpc-lib/src/rpc-clnt.c                          4
-rw-r--r--  rpc/rpc-transport/socket/src/Makefile.am            2
-rw-r--r--  rpc/rpc-transport/socket/src/socket-mem-types.h    21
-rw-r--r--  rpc/rpc-transport/socket/src/socket.c              236
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-handler.c         2
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-rebalance.c       2
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-snapshot.c        4
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-utils.c           4
8 files changed, 169 insertions(+), 106 deletions(-)
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index 44324a80431..8a460cfa617 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -429,10 +429,6 @@ rpc_clnt_reconnect (void *conn_ptr)
         }
         pthread_mutex_unlock (&conn->lock);
 
-        if ((ret == -1) && (errno != EINPROGRESS) && (clnt->notifyfn)) {
-                clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL);
-        }
-
         return;
 }
 
diff --git a/rpc/rpc-transport/socket/src/Makefile.am b/rpc/rpc-transport/socket/src/Makefile.am
index 71e6ed6ff4a..5e909aceac8 100644
--- a/rpc/rpc-transport/socket/src/Makefile.am
+++ b/rpc/rpc-transport/socket/src/Makefile.am
@@ -1,4 +1,4 @@
-noinst_HEADERS = socket.h name.h
+noinst_HEADERS = socket.h name.h socket-mem-types.h
 
 rpctransport_LTLIBRARIES = socket.la
 rpctransportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/rpc-transport
diff --git a/rpc/rpc-transport/socket/src/socket-mem-types.h b/rpc/rpc-transport/socket/src/socket-mem-types.h
new file mode 100644
index 00000000000..e5553b172a2
--- /dev/null
+++ b/rpc/rpc-transport/socket/src/socket-mem-types.h
@@ -0,0 +1,21 @@
+/*
+  Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __SOCKET_MEM_TYPES_H__
+#define __SOCKET_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+typedef enum gf_sock_mem_types_ {
+        gf_sock_connect_error_state_t = gf_common_mt_end + 1,
+        gf_sock_mt_end
+} gf_sock_mem_types_t;
+
+#endif
diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c
index 6f566e49345..6d4a862aa8d 100644
--- a/rpc/rpc-transport/socket/src/socket.c
+++ b/rpc/rpc-transport/socket/src/socket.c
@@ -23,7 +23,7 @@
 #include "byte-order.h"
 #include "common-utils.h"
 #include "compat-errno.h"
-
+#include "socket-mem-types.h"
 
 /* ugly #includes below */
 #include "protocol-common.h"
@@ -152,6 +152,13 @@ typedef int SSL_trinary_func (SSL *, void *, int);
                 __socket_proto_update_priv_after_read (priv, ret, bytes_read); \
         }
 
+struct socket_connect_error_state_ {
+        xlator_t            *this;
+        rpc_transport_t     *trans;
+        gf_boolean_t         refd;
+};
+typedef struct socket_connect_error_state_ socket_connect_error_state_t;
+
 static int socket_init (rpc_transport_t *this);
 
 static void
@@ -2652,19 +2659,41 @@ out:
         return ret;
 }
 
+void*
+socket_connect_error_cbk (void *opaque)
+{
+        socket_connect_error_state_t *arg;
+
+        GF_ASSERT (opaque);
+
+        arg = opaque;
+        THIS = arg->this;
+
+        rpc_transport_notify (arg->trans, RPC_TRANSPORT_DISCONNECT, arg->trans);
+
+        if (arg->refd)
+                rpc_transport_unref (arg->trans);
+
+        GF_FREE (opaque);
+        return NULL;
+}
+
 static int
 socket_connect (rpc_transport_t *this, int port)
 {
-        int                            ret = -1;
-        int                            sock = -1;
-        socket_private_t              *priv = NULL;
-        socklen_t                      sockaddr_len = 0;
-        glusterfs_ctx_t               *ctx = NULL;
-        sa_family_t                    sa_family = {0, };
-        char                          *local_addr = NULL;
-        union gf_sock_union            sock_union;
-        struct sockaddr_in            *addr = NULL;
+        int                            ret = -1;
+        int                            th_ret = -1;
+        int                            sock = -1;
+        socket_private_t              *priv = NULL;
+        socklen_t                      sockaddr_len = 0;
+        glusterfs_ctx_t               *ctx = NULL;
+        sa_family_t                    sa_family = {0, };
+        char                          *local_addr = NULL;
+        union gf_sock_union            sock_union;
+        struct sockaddr_in            *addr = NULL;
+        gf_boolean_t                   refd = _gf_false;
+        socket_connect_error_state_t  *arg = NULL;
+        pthread_t                      th_id = {0, };
 
         GF_VALIDATE_OR_GOTO ("socket", this, err);
         GF_VALIDATE_OR_GOTO ("socket", this->private, err);
@@ -2680,52 +2709,43 @@ socket_connect (rpc_transport_t *this, int port)
 
         pthread_mutex_lock (&priv->lock);
         {
-                sock = priv->sock;
-        }
-        pthread_mutex_unlock (&priv->lock);
-
-        if (sock != -1) {
-                gf_log_callingfn (this->name, GF_LOG_TRACE,
-                                  "connect () called on transport already connected");
-                errno = EINPROGRESS;
-                ret = -1;
-                goto err;
-        }
+                if (priv->sock != -1) {
+                        gf_log_callingfn (this->name, GF_LOG_TRACE,
+                                          "connect () called on transport "
+                                          "already connected");
+                        errno = EINPROGRESS;
+                        ret = -1;
+                        goto unlock;
+                }
 
-        gf_log (this->name, GF_LOG_TRACE,
-                "connecting %p, state=%u gen=%u sock=%d", this,
-                priv->ot_state, priv->ot_gen, priv->sock);
+                gf_log (this->name, GF_LOG_TRACE,
+                        "connecting %p, state=%u gen=%u sock=%d", this,
+                        priv->ot_state, priv->ot_gen, priv->sock);
 
-        ret = socket_client_get_remote_sockaddr (this, &sock_union.sa,
-                                                 &sockaddr_len, &sa_family);
-        if (ret == -1) {
-                /* logged inside client_get_remote_sockaddr */
-                goto err;
-        }
+                ret = socket_client_get_remote_sockaddr (this, &sock_union.sa,
+                                                         &sockaddr_len,
+                                                         &sa_family);
+                if (ret == -1) {
+                        /* logged inside client_get_remote_sockaddr */
+                        goto unlock;
+                }
 
-        if (port > 0) {
-                sock_union.sin.sin_port = htons (port);
-        }
-        if (ntohs(sock_union.sin.sin_port) == GF_DEFAULT_SOCKET_LISTEN_PORT) {
-                if (priv->use_ssl) {
-                        gf_log(this->name,GF_LOG_DEBUG,
-                               "disabling SSL for portmapper connection");
-                        priv->use_ssl = _gf_false;
+                if (port > 0) {
+                        sock_union.sin.sin_port = htons (port);
                 }
-        }
-        else {
-                if (priv->ssl_enabled && !priv->use_ssl) {
-                        gf_log(this->name,GF_LOG_DEBUG,
-                               "re-enabling SSL for I/O connection");
-                        priv->use_ssl = _gf_true;
+                if (ntohs(sock_union.sin.sin_port) ==
+                    GF_DEFAULT_SOCKET_LISTEN_PORT) {
+                        if (priv->use_ssl) {
+                                gf_log(this->name,GF_LOG_DEBUG,
+                                       "disabling SSL for portmapper connection");
+                                priv->use_ssl = _gf_false;
+                        }
                 }
-        }
-
-        pthread_mutex_lock (&priv->lock);
-        {
-                if (priv->sock != -1) {
-                        gf_log (this->name, GF_LOG_TRACE,
-                                "connect() -- already connected");
-                        goto unlock;
+                else {
+                        if (priv->ssl_enabled && !priv->use_ssl) {
+                                gf_log(this->name,GF_LOG_DEBUG,
+                                       "re-enabling SSL for I/O connection");
+                                priv->use_ssl = _gf_true;
+                        }
                 }
 
                 memcpy (&this->peerinfo.sockaddr, &sock_union.storage,
@@ -2737,6 +2757,7 @@ socket_connect (rpc_transport_t *this, int port)
                         gf_log (this->name, GF_LOG_ERROR,
                                 "socket creation failed (%s)",
                                 strerror (errno));
+                        ret = -1;
                         goto unlock;
                 }
 
@@ -2795,7 +2816,8 @@ socket_connect (rpc_transport_t *this, int port)
                                             &local_addr);
                 if (!ret && SA (&this->myinfo.sockaddr)->sa_family == AF_INET) {
                         addr = (struct sockaddr_in *)(&this->myinfo.sockaddr);
-                        ret = inet_pton (AF_INET, local_addr, &(addr->sin_addr.s_addr));
+                        ret = inet_pton (AF_INET, local_addr,
+                                         &(addr->sin_addr.s_addr));
                 }
 
                 ret = client_bind (this, SA (&this->myinfo.sockaddr),
@@ -2803,9 +2825,7 @@ socket_connect (rpc_transport_t *this, int port)
                 if (ret == -1) {
                         gf_log (this->name, GF_LOG_WARNING,
                                 "client bind failed: %s", strerror (errno));
-                        close (priv->sock);
-                        priv->sock = -1;
-                        goto unlock;
+                        goto handler;
                 }
 
                 if (!priv->use_ssl && !priv->bio && !priv->own_thread) {
@@ -2814,9 +2834,7 @@ socket_connect (rpc_transport_t *this, int port)
                                 gf_log (this->name, GF_LOG_ERROR,
                                         "NBIO on %d failed (%s)",
                                         priv->sock, strerror (errno));
-                                close (priv->sock);
-                                priv->sock = -1;
-                                goto unlock;
+                                goto handler;
                         }
                 }
 
@@ -2832,21 +2850,20 @@ socket_connect (rpc_transport_t *this, int port)
                                  GF_LOG_DEBUG : GF_LOG_ERROR),
                                 "connection attempt on %s failed, (%s)",
                                 this->peerinfo.identifier, strerror (errno));
-                        close (priv->sock);
-                        priv->sock = -1;
-                        goto unlock;
+                        goto handler;
+                }
+                else {
+                        ret = 0;
                 }
 
-        if (priv->use_ssl && !priv->own_thread) {
-                ret = ssl_setup_connection(this,0);
-                if (ret < 0) {
-                        gf_log(this->name,GF_LOG_ERROR,
-                               "client setup failed");
-                        close(priv->sock);
-                        priv->sock = -1;
-                        goto unlock;
-                }
-        }
+                if (priv->use_ssl && !priv->own_thread) {
+                        ret = ssl_setup_connection(this,0);
+                        if (ret < 0) {
+                                gf_log(this->name,GF_LOG_ERROR,
+                                       "client setup failed");
+                                goto handler;
+                        }
+                }
 
                 if (!priv->bio && !priv->own_thread) {
                         ret = __socket_nonblock (priv->sock);
@@ -2855,10 +2872,24 @@ socket_connect (rpc_transport_t *this, int port)
                                 gf_log (this->name, GF_LOG_ERROR,
                                         "NBIO on %d failed (%s)",
                                         priv->sock, strerror (errno));
-                                close (priv->sock);
+                                goto handler;
+                        }
+                }
+
+handler:
+                if (ret < 0) {
+                        if (priv->own_thread) {
+                                close(priv->sock);
                                 priv->sock = -1;
                                 goto unlock;
                         }
+                        else {
+                                /* Ignore error from connect. epoll events
+                                   should be handled in the socket handler.
+                                   shutdown(2) will result in EPOLLERR, so
+                                   cleanup is done in socket_event_handler */
+                                shutdown (priv->sock, SHUT_RDWR);
+                        }
                 }
 
                 /*
@@ -2868,31 +2899,58 @@ socket_connect (rpc_transport_t *this, int port)
                 priv->connected = 0;
                 priv->is_server = _gf_false;
                 rpc_transport_ref (this);
+                refd = _gf_true;
 
-        if (priv->own_thread) {
-                if (pipe(priv->pipe) < 0) {
-                        gf_log(this->name,GF_LOG_ERROR,
-                               "could not create pipe");
-                }
+                if (priv->own_thread) {
+                        if (pipe(priv->pipe) < 0) {
+                                gf_log(this->name,GF_LOG_ERROR,
+                                       "could not create pipe");
+                        }
 
                         this->listener = this;
                         socket_spawn(this);
-        }
-        else {
-                priv->idx = event_register (ctx->event_pool, priv->sock,
-                                            socket_event_handler,
-                                            this, 1, 1);
-                if (priv->idx == -1) {
-                        gf_log ("", GF_LOG_WARNING,
-                                "failed to register the event");
-                        ret = -1;
-                }
-        }
-
+                }
+                else {
+                        priv->idx = event_register (ctx->event_pool, priv->sock,
+                                                    socket_event_handler,
+                                                    this, 1, 1);
+                        if (priv->idx == -1) {
+                                gf_log ("", GF_LOG_WARNING,
+                                        "failed to register the event");
+                                close(priv->sock);
+                                priv->sock = -1;
+                                ret = -1;
+                        }
+                }
+
 unlock:
+                sock = priv->sock;
         }
         pthread_mutex_unlock (&priv->lock);
 
 err:
+        /* if sock != -1, then cleanup is done from the event handler */
+        if (ret == -1 && sock == -1) {
+                /* Cleaup requires to send notification to upper layer which
+                   intern holds the big_lock. There can be dead-lock situation
+                   if big_lock is already held by the current thread.
+                   So transfer the ownership to seperate thread for cleanup.
+                */
+                arg = GF_CALLOC (1, sizeof (*arg),
+                                 gf_sock_connect_error_state_t);
+                arg->this = THIS;
+                arg->trans = this;
+                arg->refd = refd;
+                th_ret = pthread_create (&th_id, NULL, socket_connect_error_cbk,
+                                         arg);
+                if (th_ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "pthread_create"
+                                "failed: %s", strerror(errno));
+                        GF_FREE (arg);
+                        GF_ASSERT (0);
+                }
+        }
+
         return ret;
 }
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 733bea236ed..e6b744af536 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3135,9 +3135,7 @@ glusterd_friend_add (const char *hoststr, int port,
         if (!restore) {
                 ret = glusterd_store_peerinfo (*friend);
                 if (ret == 0) {
-                        synclock_unlock (&conf->big_lock);
                         ret = glusterd_friend_rpc_create (this, *friend, args);
-                        synclock_lock (&conf->big_lock);
                 }
                 else {
                         gf_log (this->name, GF_LOG_ERROR,
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index cf2272028c6..f2e7338a80e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -374,10 +374,8 @@ glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
         }
 
         glusterd_volinfo_ref (volinfo);
-        synclock_unlock (&priv->big_lock);
         ret = glusterd_rpc_create (&defrag->rpc, options,
                                    glusterd_defrag_notify, volinfo);
-        synclock_lock (&priv->big_lock);
         if (ret) {
                 gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
                         "Glusterd RPC creation failed");
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
index eef6129745a..7d3c795b436 100644
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -4812,10 +4812,6 @@ glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr,
                 goto out;
         }
 
-        /*TODO: As of now start the bricks as part of snapshot creation op.
-                brick_start releases the big_lock and this can cause regression
-                for bug# 1088355.
-                We need to fix brick_connect not to release big_lock*/
         list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
                 list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
                         ret = glusterd_brick_start (snap_vol, brickinfo,
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 540e97b8633..a2a746d247e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1717,11 +1717,9 @@ glusterd_brick_connect (glusterd_volinfo_t *volinfo,
                 if (ret < 0)
                         goto out;
 
-                synclock_unlock (&priv->big_lock);
                 ret = glusterd_rpc_create (&rpc, options,
                                            glusterd_brick_rpc_notify,
                                            brickid);
-                synclock_lock (&priv->big_lock);
                 if (ret) {
                         GF_FREE (brickid);
                         goto out;
@@ -5735,11 +5733,9 @@ glusterd_nodesvc_connect (char *server, char *socketpath)
                                            600);
                 if (ret)
                         goto out;
-                synclock_unlock (&priv->big_lock);
                 ret = glusterd_rpc_create (&rpc, options,
                                            glusterd_nodesvc_rpc_notify,
                                            server);
-                synclock_lock (&priv->big_lock);
                 if (ret)
                         goto out;
                 (void) glusterd_nodesvc_set_rpc (server, rpc);