summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVijaikumar M <vmallika@redhat.com>2014-05-23 14:42:08 +0530
committerRaghavendra G <rgowdapp@redhat.com>2014-06-05 10:11:45 -0700
commit42b956971c47fd0708cbbd17ce8c78c2ed79bfba (patch)
treec8056e730ff7277a6a323aad85ac8874862e4099
parenta89e35727e3a9a7226c7a16479935b7109b11663 (diff)
glusterd: Handle rpc_connect failure in the event handler
Currently rpc_connect calls the notification function on failure in the same thread, glusterd notification holds the big_lock and hence big_lock is released before rpc_connect In snapshot creation, releasing the big-lock before completing the operation can cause problems like deadlock or memory corruption. Bricks are started as part of the snapshot create operation. brick_start releases the big_lock when doing brick_connect and this might cause a glusterd crash. There is a similar issue in bug# 1088355. The solution is to let the event handler handle the failure rather than doing it in rpc_connect. Change-Id: I088d44092ce845a07516c1d67abd02b220e08b38 BUG: 1101507 Signed-off-by: Vijaikumar M <vmallika@redhat.com> Reviewed-on: http://review.gluster.org/7843 Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com> Reviewed-by: Jeff Darcy <jdarcy@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Raghavendra G <rgowdapp@redhat.com> Tested-by: Raghavendra G <rgowdapp@redhat.com>
-rw-r--r--rpc/rpc-lib/src/rpc-clnt.c4
-rw-r--r--rpc/rpc-transport/socket/src/Makefile.am2
-rw-r--r--rpc/rpc-transport/socket/src/socket-mem-types.h21
-rw-r--r--rpc/rpc-transport/socket/src/socket.c236
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c4
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c4
8 files changed, 169 insertions, 106 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index 44324a80431..8a460cfa617 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -429,10 +429,6 @@ rpc_clnt_reconnect (void *conn_ptr)
}
pthread_mutex_unlock (&conn->lock);
- if ((ret == -1) && (errno != EINPROGRESS) && (clnt->notifyfn)) {
- clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL);
- }
-
return;
}
diff --git a/rpc/rpc-transport/socket/src/Makefile.am b/rpc/rpc-transport/socket/src/Makefile.am
index 71e6ed6ff4a..5e909aceac8 100644
--- a/rpc/rpc-transport/socket/src/Makefile.am
+++ b/rpc/rpc-transport/socket/src/Makefile.am
@@ -1,4 +1,4 @@
-noinst_HEADERS = socket.h name.h
+noinst_HEADERS = socket.h name.h socket-mem-types.h
rpctransport_LTLIBRARIES = socket.la
rpctransportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/rpc-transport
diff --git a/rpc/rpc-transport/socket/src/socket-mem-types.h b/rpc/rpc-transport/socket/src/socket-mem-types.h
new file mode 100644
index 00000000000..e5553b172a2
--- /dev/null
+++ b/rpc/rpc-transport/socket/src/socket-mem-types.h
@@ -0,0 +1,21 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __SOCKET_MEM_TYPES_H__
+#define __SOCKET_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+typedef enum gf_sock_mem_types_ {
+ gf_sock_connect_error_state_t = gf_common_mt_end + 1,
+ gf_sock_mt_end
+} gf_sock_mem_types_t;
+
+#endif
diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c
index 6f566e49345..6d4a862aa8d 100644
--- a/rpc/rpc-transport/socket/src/socket.c
+++ b/rpc/rpc-transport/socket/src/socket.c
@@ -23,7 +23,7 @@
#include "byte-order.h"
#include "common-utils.h"
#include "compat-errno.h"
-
+#include "socket-mem-types.h"
/* ugly #includes below */
#include "protocol-common.h"
@@ -152,6 +152,13 @@ typedef int SSL_trinary_func (SSL *, void *, int);
__socket_proto_update_priv_after_read (priv, ret, bytes_read); \
}
+struct socket_connect_error_state_ {
+ xlator_t *this;
+ rpc_transport_t *trans;
+ gf_boolean_t refd;
+};
+typedef struct socket_connect_error_state_ socket_connect_error_state_t;
+
static int socket_init (rpc_transport_t *this);
static void
@@ -2652,19 +2659,41 @@ out:
return ret;
}
+void*
+socket_connect_error_cbk (void *opaque)
+{
+ socket_connect_error_state_t *arg;
+
+ GF_ASSERT (opaque);
+
+ arg = opaque;
+ THIS = arg->this;
+
+ rpc_transport_notify (arg->trans, RPC_TRANSPORT_DISCONNECT, arg->trans);
+
+ if (arg->refd)
+ rpc_transport_unref (arg->trans);
+
+ GF_FREE (opaque);
+ return NULL;
+}
static int
socket_connect (rpc_transport_t *this, int port)
{
- int ret = -1;
- int sock = -1;
- socket_private_t *priv = NULL;
- socklen_t sockaddr_len = 0;
- glusterfs_ctx_t *ctx = NULL;
- sa_family_t sa_family = {0, };
- char *local_addr = NULL;
- union gf_sock_union sock_union;
- struct sockaddr_in *addr = NULL;
+ int ret = -1;
+ int th_ret = -1;
+ int sock = -1;
+ socket_private_t *priv = NULL;
+ socklen_t sockaddr_len = 0;
+ glusterfs_ctx_t *ctx = NULL;
+ sa_family_t sa_family = {0, };
+ char *local_addr = NULL;
+ union gf_sock_union sock_union;
+ struct sockaddr_in *addr = NULL;
+ gf_boolean_t refd = _gf_false;
+ socket_connect_error_state_t *arg = NULL;
+ pthread_t th_id = {0, };
GF_VALIDATE_OR_GOTO ("socket", this, err);
GF_VALIDATE_OR_GOTO ("socket", this->private, err);
@@ -2680,52 +2709,43 @@ socket_connect (rpc_transport_t *this, int port)
pthread_mutex_lock (&priv->lock);
{
- sock = priv->sock;
- }
- pthread_mutex_unlock (&priv->lock);
-
- if (sock != -1) {
- gf_log_callingfn (this->name, GF_LOG_TRACE,
- "connect () called on transport already connected");
- errno = EINPROGRESS;
- ret = -1;
- goto err;
- }
+ if (priv->sock != -1) {
+ gf_log_callingfn (this->name, GF_LOG_TRACE,
+ "connect () called on transport "
+ "already connected");
+ errno = EINPROGRESS;
+ ret = -1;
+ goto unlock;
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "connecting %p, state=%u gen=%u sock=%d", this,
- priv->ot_state, priv->ot_gen, priv->sock);
+ gf_log (this->name, GF_LOG_TRACE,
+ "connecting %p, state=%u gen=%u sock=%d", this,
+ priv->ot_state, priv->ot_gen, priv->sock);
- ret = socket_client_get_remote_sockaddr (this, &sock_union.sa,
- &sockaddr_len, &sa_family);
- if (ret == -1) {
- /* logged inside client_get_remote_sockaddr */
- goto err;
- }
+ ret = socket_client_get_remote_sockaddr (this, &sock_union.sa,
+ &sockaddr_len, &sa_family);
+ if (ret == -1) {
+ /* logged inside client_get_remote_sockaddr */
+ goto unlock;
+ }
- if (port > 0) {
- sock_union.sin.sin_port = htons (port);
- }
- if (ntohs(sock_union.sin.sin_port) == GF_DEFAULT_SOCKET_LISTEN_PORT) {
- if (priv->use_ssl) {
- gf_log(this->name,GF_LOG_DEBUG,
- "disabling SSL for portmapper connection");
- priv->use_ssl = _gf_false;
+ if (port > 0) {
+ sock_union.sin.sin_port = htons (port);
}
- }
- else {
- if (priv->ssl_enabled && !priv->use_ssl) {
- gf_log(this->name,GF_LOG_DEBUG,
- "re-enabling SSL for I/O connection");
- priv->use_ssl = _gf_true;
+ if (ntohs(sock_union.sin.sin_port) ==
+ GF_DEFAULT_SOCKET_LISTEN_PORT) {
+ if (priv->use_ssl) {
+ gf_log(this->name,GF_LOG_DEBUG,
+ "disabling SSL for portmapper connection");
+ priv->use_ssl = _gf_false;
+ }
}
- }
- pthread_mutex_lock (&priv->lock);
- {
- if (priv->sock != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "connect() -- already connected");
- goto unlock;
+ else {
+ if (priv->ssl_enabled && !priv->use_ssl) {
+ gf_log(this->name,GF_LOG_DEBUG,
+ "re-enabling SSL for I/O connection");
+ priv->use_ssl = _gf_true;
+ }
}
memcpy (&this->peerinfo.sockaddr, &sock_union.storage,
@@ -2737,6 +2757,7 @@ socket_connect (rpc_transport_t *this, int port)
gf_log (this->name, GF_LOG_ERROR,
"socket creation failed (%s)",
strerror (errno));
+ ret = -1;
goto unlock;
}
@@ -2795,7 +2816,8 @@ socket_connect (rpc_transport_t *this, int port)
&local_addr);
if (!ret && SA (&this->myinfo.sockaddr)->sa_family == AF_INET) {
addr = (struct sockaddr_in *)(&this->myinfo.sockaddr);
- ret = inet_pton (AF_INET, local_addr, &(addr->sin_addr.s_addr));
+ ret = inet_pton (AF_INET, local_addr,
+ &(addr->sin_addr.s_addr));
}
ret = client_bind (this, SA (&this->myinfo.sockaddr),
@@ -2803,9 +2825,7 @@ socket_connect (rpc_transport_t *this, int port)
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"client bind failed: %s", strerror (errno));
- close (priv->sock);
- priv->sock = -1;
- goto unlock;
+ goto handler;
}
if (!priv->use_ssl && !priv->bio && !priv->own_thread) {
@@ -2814,9 +2834,7 @@ socket_connect (rpc_transport_t *this, int port)
gf_log (this->name, GF_LOG_ERROR,
"NBIO on %d failed (%s)",
priv->sock, strerror (errno));
- close (priv->sock);
- priv->sock = -1;
- goto unlock;
+ goto handler;
}
}
@@ -2832,21 +2850,20 @@ socket_connect (rpc_transport_t *this, int port)
GF_LOG_DEBUG : GF_LOG_ERROR),
"connection attempt on %s failed, (%s)",
this->peerinfo.identifier, strerror (errno));
- close (priv->sock);
- priv->sock = -1;
- goto unlock;
+ goto handler;
+ }
+ else {
+ ret = 0;
}
- if (priv->use_ssl && !priv->own_thread) {
- ret = ssl_setup_connection(this,0);
- if (ret < 0) {
- gf_log(this->name,GF_LOG_ERROR,
- "client setup failed");
- close(priv->sock);
- priv->sock = -1;
- goto unlock;
- }
- }
+ if (priv->use_ssl && !priv->own_thread) {
+ ret = ssl_setup_connection(this,0);
+ if (ret < 0) {
+ gf_log(this->name,GF_LOG_ERROR,
+ "client setup failed");
+ goto handler;
+ }
+ }
if (!priv->bio && !priv->own_thread) {
ret = __socket_nonblock (priv->sock);
@@ -2855,10 +2872,24 @@ socket_connect (rpc_transport_t *this, int port)
gf_log (this->name, GF_LOG_ERROR,
"NBIO on %d failed (%s)",
priv->sock, strerror (errno));
- close (priv->sock);
+ goto handler;
+ }
+ }
+
+handler:
+ if (ret < 0) {
+ if (priv->own_thread) {
+ close(priv->sock);
priv->sock = -1;
goto unlock;
}
+ else {
+ /* Ignore error from connect. epoll events
+ should be handled in the socket handler.
+ shutdown(2) will result in EPOLLERR, so
+ cleanup is done in socket_event_handler */
+ shutdown (priv->sock, SHUT_RDWR);
+ }
}
/*
@@ -2868,31 +2899,58 @@ socket_connect (rpc_transport_t *this, int port)
priv->connected = 0;
priv->is_server = _gf_false;
rpc_transport_ref (this);
+ refd = _gf_true;
- if (priv->own_thread) {
- if (pipe(priv->pipe) < 0) {
- gf_log(this->name,GF_LOG_ERROR,
- "could not create pipe");
- }
+ if (priv->own_thread) {
+ if (pipe(priv->pipe) < 0) {
+ gf_log(this->name,GF_LOG_ERROR,
+ "could not create pipe");
+ }
this->listener = this;
socket_spawn(this);
- }
- else {
- priv->idx = event_register (ctx->event_pool, priv->sock,
- socket_event_handler,
- this, 1, 1);
- if (priv->idx == -1) {
- gf_log ("", GF_LOG_WARNING,
- "failed to register the event");
- ret = -1;
- }
- }
- }
+ }
+ else {
+ priv->idx = event_register (ctx->event_pool, priv->sock,
+ socket_event_handler,
+ this, 1, 1);
+ if (priv->idx == -1) {
+ gf_log ("", GF_LOG_WARNING,
+ "failed to register the event");
+ close(priv->sock);
+ priv->sock = -1;
+ ret = -1;
+ }
+ }
+
unlock:
+ sock = priv->sock;
+ }
pthread_mutex_unlock (&priv->lock);
err:
+ /* if sock != -1, then cleanup is done from the event handler */
+ if (ret == -1 && sock == -1) {
+ /* Cleaup requires to send notification to upper layer which
+ intern holds the big_lock. There can be dead-lock situation
+ if big_lock is already held by the current thread.
+ So transfer the ownership to seperate thread for cleanup.
+ */
+ arg = GF_CALLOC (1, sizeof (*arg),
+ gf_sock_connect_error_state_t);
+ arg->this = THIS;
+ arg->trans = this;
+ arg->refd = refd;
+ th_ret = pthread_create (&th_id, NULL, socket_connect_error_cbk,
+ arg);
+ if (th_ret) {
+ gf_log (this->name, GF_LOG_ERROR, "pthread_create"
+ "failed: %s", strerror(errno));
+ GF_FREE (arg);
+ GF_ASSERT (0);
+ }
+ }
+
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 733bea236ed..e6b744af536 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3135,9 +3135,7 @@ glusterd_friend_add (const char *hoststr, int port,
if (!restore) {
ret = glusterd_store_peerinfo (*friend);
if (ret == 0) {
- synclock_unlock (&conf->big_lock);
ret = glusterd_friend_rpc_create (this, *friend, args);
- synclock_lock (&conf->big_lock);
}
else {
gf_log (this->name, GF_LOG_ERROR,
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index cf2272028c6..f2e7338a80e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -374,10 +374,8 @@ glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
}
glusterd_volinfo_ref (volinfo);
- synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&defrag->rpc, options,
glusterd_defrag_notify, volinfo);
- synclock_lock (&priv->big_lock);
if (ret) {
gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
"Glusterd RPC creation failed");
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
index eef6129745a..7d3c795b436 100644
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -4812,10 +4812,6 @@ glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr,
goto out;
}
- /*TODO: As of now start the bricks as part of snapshot creation op.
- brick_start releases the big_lock and this can cause regression
- for bug# 1088355.
- We need to fix brick_connect not to release big_lock*/
list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
ret = glusterd_brick_start (snap_vol, brickinfo,
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 540e97b8633..a2a746d247e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1717,11 +1717,9 @@ glusterd_brick_connect (glusterd_volinfo_t *volinfo,
if (ret < 0)
goto out;
- synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&rpc, options,
glusterd_brick_rpc_notify,
brickid);
- synclock_lock (&priv->big_lock);
if (ret) {
GF_FREE (brickid);
goto out;
@@ -5735,11 +5733,9 @@ glusterd_nodesvc_connect (char *server, char *socketpath)
600);
if (ret)
goto out;
- synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&rpc, options,
glusterd_nodesvc_rpc_notify,
server);
- synclock_lock (&priv->big_lock);
if (ret)
goto out;
(void) glusterd_nodesvc_set_rpc (server, rpc);