diff options
| author | Raghavendra G <rgowdapp@redhat.com> | 2017-02-28 13:13:59 +0530 | 
|---|---|---|
| committer | Raghavendra G <rgowdapp@redhat.com> | 2017-03-01 09:35:48 -0500 | 
| commit | 773f32caf190af4ee48818279b6e6d3c9f2ecc79 (patch) | |
| tree | e94281239124bc9933934bbc162e4057ac555621 /rpc | |
| parent | 13201e55299b4229a4481d886b12fd34d76044ac (diff) | |
rpc/clnt: remove locks while notifying CONNECT/DISCONNECT
Locking during notify was introduced as part of commit
aa22f24f5db7659387704998ae01520708869873 [1]. The fix was introduced
to fix out-of-order CONNECT/DISCONNECT events from rpc-clnt to parent
xlators [2]. However as part of handling DISCONNECT protocol/client
does unwind saved frames (with failure) waiting for responses. This
saved_frames_unwind can be a costly operation and hence ideally
shouldn't be included in the critical section of notifylock, as it
unnecessarily delays the reconnection to same brick. Also, its not a
good practise to pass control to other xlators holding a lock as it
can lead to deadlocks. So, this patch removes locking in rpc-clnt
while notifying parent xlators.
To fix [2], two changes are present in this patch:
* notify DISCONNECT before cleaning up rpc connection (same as commit
  a6b63e11b7758cf1bfcb6798, patch [3]).
* protocol/client uses rpc_clnt_cleanup_and_start, which cleans up rpc
  connection and does a start while handling a DISCONNECT event from
  rpc. Note that patch [3] was reverted as rpc_clnt_start called in
  quick_reconnect path of protocol/client didn't invoke connect on
  transport as the connection was not cleaned up _yet_ (as cleanup was
  moved post notification in rpc-clnt). This resulted in clients never
  attempting connect to bricks.
Note that one of the neater ways to fix [2] (without using locks) is
to introduce generation numbers to map CONNECT and DISCONNECTS across
epochs and ignore DISCONNECT events if they don't belong to current
epoch. However, this approach is a bit complex to implement and
requires time. So, current patch is a hacky stop-gap fix till we come
up with a more cleaner solution.
[1] http://review.gluster.org/15916
[2] https://bugzilla.redhat.com/show_bug.cgi?id=1386626
[3] http://review.gluster.org/15681
Change-Id: I62daeee8bb1430004e28558f6eb133efd4ccf418
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
BUG: 1427012
Reviewed-on: https://review.gluster.org/16784
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Milind Changire <mchangir@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'rpc')
| -rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.c | 94 | ||||
| -rw-r--r-- | rpc/rpc-lib/src/rpc-clnt.h | 4 | 
2 files changed, 67 insertions, 31 deletions
| diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index 6683718164b..9b5d5a112d9 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -549,6 +549,7 @@ rpc_clnt_connection_cleanup (rpc_clnt_connection_t *conn)                  /*reset rpc msgs stats*/                  conn->pingcnt = 0;                  conn->msgcnt = 0; +                conn->cleanup_gen++;          }          pthread_mutex_unlock (&conn->lock); @@ -874,10 +875,29 @@ rpc_clnt_destroy (struct rpc_clnt *rpc);  static int  rpc_clnt_handle_disconnect (struct rpc_clnt *clnt, rpc_clnt_connection_t *conn)  { -        struct timespec         ts          = {0, }; -        gf_boolean_t            unref_clnt  = _gf_false; +        struct timespec ts             = {0, }; +        gf_boolean_t    unref_clnt     = _gf_false; +        uint64_t        pre_notify_gen = 0, post_notify_gen = 0; -        rpc_clnt_connection_cleanup (conn); +        pthread_mutex_lock (&conn->lock); +        { +                pre_notify_gen = conn->cleanup_gen; +        } +        pthread_mutex_unlock (&conn->lock); + +        if (clnt->notifyfn) +                clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL); + +        pthread_mutex_lock (&conn->lock); +        { +                post_notify_gen = conn->cleanup_gen; +        } +        pthread_mutex_unlock (&conn->lock); + +        if (pre_notify_gen == post_notify_gen) { +                /* program didn't invoke cleanup, so rpc has to do it */ +                rpc_clnt_connection_cleanup (conn); +        }          pthread_mutex_lock (&conn->lock);          { @@ -897,8 +917,6 @@ rpc_clnt_handle_disconnect (struct rpc_clnt *clnt, rpc_clnt_connection_t *conn)          }          pthread_mutex_unlock (&conn->lock); -        if (clnt->notifyfn) -                clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL);          if (unref_clnt)                  rpc_clnt_unref (clnt); @@ -931,11 +949,7 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,          switch (event) {          case RPC_TRANSPORT_DISCONNECT:          { -                pthread_mutex_lock (&clnt->notifylock); -                { -                        rpc_clnt_handle_disconnect (clnt, conn); -                } -                pthread_mutex_unlock (&clnt->notifylock); +                rpc_clnt_handle_disconnect (clnt, conn);                  break;          } @@ -990,21 +1004,19 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,          case RPC_TRANSPORT_CONNECT:          { -                pthread_mutex_lock (&clnt->notifylock); -                { -                        /* Every time there is a disconnection, processes -                         * should try to connect to 'glusterd' (ie, default -                         * port) or whichever port given as 'option remote-port' -                         * in volume file. */ -                        /* Below code makes sure the (re-)configured port lasts -                         * for just one successful attempt */ -                        conn->config.remote_port = 0; - -                        if (clnt->notifyfn) -                                ret = clnt->notifyfn (clnt, clnt->mydata, -                                                RPC_CLNT_CONNECT, NULL); -                } -                pthread_mutex_unlock (&clnt->notifylock); + +                /* Every time there is a disconnection, processes +                 * should try to connect to 'glusterd' (ie, default +                 * port) or whichever port given as 'option remote-port' +                 * in volume file. */ +                /* Below code makes sure the (re-)configured port lasts +                 * for just one successful attempt */ +                conn->config.remote_port = 0; + +                if (clnt->notifyfn) +                        ret = clnt->notifyfn (clnt, clnt->mydata, +                                              RPC_CLNT_CONNECT, NULL); +                  break;          } @@ -1128,7 +1140,6 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,          }          pthread_mutex_init (&rpc->lock, NULL); -        pthread_mutex_init (&rpc->notifylock, NULL);          rpc->ctx = ctx;          rpc->owner = owner; @@ -1138,7 +1149,6 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,          rpc->reqpool = mem_pool_new (struct rpc_req, reqpool_size);          if (rpc->reqpool == NULL) {                  pthread_mutex_destroy (&rpc->lock); -                pthread_mutex_destroy (&rpc->notifylock);                  GF_FREE (rpc);                  rpc = NULL;                  goto out; @@ -1148,7 +1158,6 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,                                                 reqpool_size);          if (rpc->saved_frames_pool == NULL) {                  pthread_mutex_destroy (&rpc->lock); -                pthread_mutex_destroy (&rpc->notifylock);                  mem_pool_destroy (rpc->reqpool);                  GF_FREE (rpc);                  rpc = NULL; @@ -1158,7 +1167,6 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,          ret = rpc_clnt_connection_init (rpc, ctx, options, name);          if (ret == -1) {                  pthread_mutex_destroy (&rpc->lock); -                pthread_mutex_destroy (&rpc->notifylock);                  mem_pool_destroy (rpc->reqpool);                  mem_pool_destroy (rpc->saved_frames_pool);                  GF_FREE (rpc); @@ -1204,6 +1212,33 @@ rpc_clnt_start (struct rpc_clnt *rpc)  int +rpc_clnt_cleanup_and_start (struct rpc_clnt *rpc) +{ +        struct rpc_clnt_connection *conn = NULL; + +        if (!rpc) +                return -1; + +        conn = &rpc->conn; + +        rpc_clnt_connection_cleanup (conn); + +        pthread_mutex_lock (&conn->lock); +        { +                rpc->disabled = 0; +        } +        pthread_mutex_unlock (&conn->lock); +        /* Corresponding unref will be either on successful timer cancel or last +         * rpc_clnt_reconnect fire event. +         */ +        rpc_clnt_ref (rpc); +        rpc_clnt_reconnect (conn); + +        return 0; +} + + +int  rpc_clnt_register_notify (struct rpc_clnt *rpc, rpc_clnt_notify_t fn,                            void *mydata)  { @@ -1754,7 +1789,6 @@ rpc_clnt_destroy (struct rpc_clnt *rpc)          saved_frames_destroy (rpc->conn.saved_frames);          pthread_mutex_destroy (&rpc->lock);          pthread_mutex_destroy (&rpc->conn.lock); -        pthread_mutex_destroy (&rpc->notifylock);          /* mem-pool should be destroyed, otherwise,             it will cause huge memory leaks */ diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index 4d66498a0aa..b731ba2dfad 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -149,6 +149,7 @@ struct rpc_clnt_connection {  	int32_t                  ping_timeout;          uint64_t                 pingcnt;          uint64_t                 msgcnt; +        uint64_t                 cleanup_gen;  };  typedef struct rpc_clnt_connection rpc_clnt_connection_t; @@ -171,7 +172,6 @@ struct rpc_req {  typedef struct rpc_clnt {          pthread_mutex_t        lock; -        pthread_mutex_t        notifylock;          rpc_clnt_notify_t      notifyfn;          rpc_clnt_connection_t  conn;          void                  *mydata; @@ -198,6 +198,8 @@ struct rpc_clnt *rpc_clnt_new (dict_t *options, xlator_t *owner,  int rpc_clnt_start (struct rpc_clnt *rpc); +int rpc_clnt_cleanup_and_start (struct rpc_clnt *rpc); +  int rpc_clnt_register_notify (struct rpc_clnt *rpc, rpc_clnt_notify_t fn,                                void *mydata); | 
