summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Atin Mukherjee <amukherj@redhat.com> 2016-09-08 11:33:59 +0530
committer Niels de Vos <ndevos@redhat.com> 2016-09-30 06:27:26 -0700
commit ad9a1d1e8430388995f6a3fcd192ada7c9417a8d (patch)
tree 8d0a8cddf083e2b59a7b4007d5fe0ff9df3d7893
parent 7a50690e8939d1f806556ece1bad9fc2a81c3b80 (diff)
socket: pollerr event shouldn't trigger socket_connect_finish
If connect fails with any error other than EINPROGRESS, we cannot get the error status using getsockopt (... SO_ERROR ...). Hence we need to remember the state of connect and take appropriate action in the event_handler for the same.

As an added note, an event can arrive where poll_err is HUP and poll_in is set as well (i.e. some status was written to the socket), so for such cases we need to finish the connect, process the data, and then handle the poll_err, as is the case in the current code.

Special thanks to Kaushal M & Raghavendra G for figuring out the issue.

>Signed-off-by: Shyam <srangana@redhat.com>
>Reviewed-on: http://review.gluster.org/15440
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Raghavendra G <rgowdapp@redhat.com>

Change-Id: Ic45ad59ff8ab1d0a9d2cab2c924ad940b9d38528
BUG: 1373723
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-on: http://review.gluster.org/15532
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
-rw-r--r-- rpc/rpc-transport/socket/src/socket.c 42
-rw-r--r-- rpc/rpc-transport/socket/src/socket.h 3
-rw-r--r-- tests/bugs/changelog/bug-1211327.t 8
-rw-r--r-- tests/bugs/ec/bug-1236065.t 4
4 files changed, 49 insertions, 8 deletions
diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c
index deec0cbc6f4..d9383c36117 100644
--- a/rpc/rpc-transport/socket/src/socket.c
+++ b/rpc/rpc-transport/socket/src/socket.c
@@ -2346,6 +2346,7 @@ out:
return ret;
}
+static int socket_disconnect (rpc_transport_t *this);
/* reads rpc_requests during pollin */
static int
@@ -2370,7 +2371,23 @@ socket_event_handler (int fd, int idx, void *data,
}
pthread_mutex_unlock (&priv->lock);
- ret = (priv->connected == 1) ? 0 : socket_connect_finish(this);
+ if (priv->connected != 1) {
+ if (priv->connect_failed) {
+ /* connect failed with some other error than
+ EINPROGRESS or ENOENT, so nothing more to do, fail
+ reading/writing anything even if poll_in or poll_out
+ is set */
+ ret = socket_disconnect (this);
+
+ /* Force ret to be -1, as we are officially done with
+ this socket */
+ ret = -1;
+ } else {
+ ret = socket_connect_finish (this);
+ }
+ } else {
+ ret = 0;
+ }
if (!ret && poll_out) {
ret = socket_event_poll_out (this);
@@ -3046,6 +3063,16 @@ socket_connect (rpc_transport_t *this, int port)
gf_log (this->name, GF_LOG_WARNING,
"Ignore failed connection attempt on %s, (%s) ",
this->peerinfo.identifier, strerror (errno));
+
+ /* connect failed with some other error than EINPROGRESS
+ so, getsockopt (... SO_ERROR ...), will not catch any
+ errors and return them to us, we need to remember this
+ state, and take actions in socket_event_handler
+ appropriately */
+ /* TBD: What about ENOENT, we will do getsockopt there
+ as well, so how is that exempt from such a problem? */
+ priv->connect_failed = 1;
+
goto handler;
}
@@ -3058,9 +3085,22 @@ socket_connect (rpc_transport_t *this, int port)
GF_LOG_DEBUG : GF_LOG_ERROR),
"connection attempt on %s failed, (%s)",
this->peerinfo.identifier, strerror (errno));
+
+ /* connect failed with some other error than EINPROGRESS
+ so, getsockopt (... SO_ERROR ...), will not catch any
+ errors and return them to us, we need to remember this
+ state, and take actions in socket_event_handler
+ appropriately */
+ /* TBD: What about ENOENT, we will do getsockopt there
+ as well, so how is that exempt from such a problem? */
+ priv->connect_failed = 1;
+
goto handler;
}
else {
+ /* reset connect_failed so that any previous attempts
+ state is not carried forward */
+ priv->connect_failed = 0;
ret = 0;
}
diff --git a/rpc/rpc-transport/socket/src/socket.h b/rpc/rpc-transport/socket/src/socket.h
index 8395fd2fa58..7c7005b59e7 100644
--- a/rpc/rpc-transport/socket/src/socket.h
+++ b/rpc/rpc-transport/socket/src/socket.h
@@ -200,6 +200,9 @@ typedef struct {
int32_t idx;
/* -1 = not connected. 0 = in progress. 1 = connected */
char connected;
+ /* 1 = connect failed for reasons other than EINPROGRESS/ENOENT
+ see socket_connect for details */
+ char connect_failed;
char bio;
char connect_finish_log;
char submit_log;
diff --git a/tests/bugs/changelog/bug-1211327.t b/tests/bugs/changelog/bug-1211327.t
index 19d6e76ecab..a849ec3981f 100644
--- a/tests/bugs/changelog/bug-1211327.t
+++ b/tests/bugs/changelog/bug-1211327.t
@@ -27,15 +27,13 @@ TEST $CLI volume set $V0 changelog.changelog on;
sleep 1
TEST killall_gluster;
-sleep 1
-EXPECT 0 online_brick_count;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" online_brick_count
TEST glusterd;
TEST pidof glusterd;
-##Let the brick processes starts
-sleep 1;
-EXPECT 1 online_brick_count;
+##Let the brick processes starts
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" online_brick_count
##On brick restart only one HTIME should be found.
EXPECT 1 count_htime_files;
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
index 9038cb95948..e425f3effe2 100644
--- a/tests/bugs/ec/bug-1236065.t
+++ b/tests/bugs/ec/bug-1236065.t
@@ -48,7 +48,7 @@ TEST ec_test_make
## step 4
TEST $CLI volume start $V0 force
-EXPECT '7' online_brick_count
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "7" online_brick_count
# active heal
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
@@ -70,7 +70,7 @@ EXPECT '5' online_brick_count
## step 6
TEST $CLI volume start $V0 force
-EXPECT '7' online_brick_count
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "7" online_brick_count
# self-healing
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid