| author | Mohit Agrawal <moagrawa@redhat.com> | 2018-03-12 19:43:15 +0530 |
|---|---|---|
| committer | Raghavendra G <rgowdapp@redhat.com> | 2018-04-19 04:31:51 +0000 |
| commit | 0043c63f70776444f69667a4ef9596217ecb42b7 | |
| tree | e6c239e4b27198d40bca329edcce317ded59de09 /glusterfsd | |
| parent | be26b0da2f1a7fe336400de6a1c016716983bd38 | |
gluster: Brick process sometimes crashes when stopping a brick
Problem: The brick process sometimes crashes while stopping a brick
         when brick multiplexing (brick mux) is enabled.
Solution: The brick process was crashing because the RPC connection was
          not cleaned up properly while brick mux is enabled. With this
          patch, after sending the GF_EVENT_CLEANUP notification to the
          xlator (server), we wait until all RPC client connections for
          that specific xlator have been destroyed. Once the RPC
          connections of all clients associated with that brick have been
          destroyed in server_rpc_notify, xlator_mem_cleanup is called for
          the brick xlator as well as all of its child xlators. To avoid
          races during cleanup, two new flags are introduced on each
          xlator: cleanup_starting and call_cleanup.
BUG: 1544090
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
Note: All test cases were run in a separate build
      (https://review.gluster.org/#/c/19700/) carrying this same patch
      with brick mux forcibly enabled, and all test cases passed.
Change-Id: Ic4ab9c128df282d146cf1135640281fcb31997bf
updates: bz#1544090
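
To make the role of the two flags concrete, here is a minimal sketch of the guard pattern this patch applies. The fields cleanup_starting and call_cleanup and the xlator_call_fini logic follow the diff below; the struct is reduced to just what the sketch needs and is not the real xlator_t definition:

```c
/* Minimal illustration of the double-cleanup guard; the real xlator_t
 * carries many more fields than shown here. */
typedef struct xlator xlator_t;
struct xlator {
        int       cleanup_starting;  /* set once teardown of this xlator begins */
        int       call_cleanup;      /* set once memory cleanup is in flight */
        xlator_t *next;
        void    (*fini) (xlator_t *this);
};

void
xlator_call_fini (xlator_t *this)
{
        /* A repeated or racing caller sees cleanup_starting already set
         * and returns, so fini() runs at most once per xlator. */
        if (!this || this->cleanup_starting)
                return;
        this->cleanup_starting = 1;
        this->call_cleanup = 1;
        xlator_call_fini (this->next);
        this->fini (this);
}
```

xlator_mem_cleanup applies the same idea with call_cleanup, returning immediately if a cleanup for that xlator is already in flight.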
Diffstat (limited to 'glusterfsd')
| -rw-r--r-- | glusterfsd/src/glusterfsd-mgmt.c | 96 | 
1 file changed, 75 insertions(+), 21 deletions(-)
```diff
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index bce8d5cc276..e6c8a27e9e8 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -194,11 +194,6 @@ xlator_mem_free (xlator_t *xl)
         if (!xl)
                 return 0;
 
-        GF_FREE (xl->name);
-        GF_FREE (xl->type);
-        xl->name = NULL;
-        xl->type = NULL;
-
         if (xl->options) {
                 dict_ref (xl->options);
                 dict_unref (xl->options);
@@ -210,25 +205,38 @@ xlator_mem_free (xlator_t *xl)
                 GF_FREE (vol_opt);
         }
 
+        xlator_memrec_free (xl);
+
         return 0;
 }
 
 void
 xlator_call_fini (xlator_t *this) {
-        if (!this)
+        if (!this || this->cleanup_starting)
                 return;
+        this->cleanup_starting = 1;
+        this->call_cleanup = 1;
         xlator_call_fini (this->next);
         this->fini (this);
 }
 
 void
 xlator_mem_cleanup (xlator_t *this) {
-        xlator_list_t     *list = this->children;
-        xlator_t          *trav = list->xlator;
-        inode_table_t     *inode_table = NULL;
-        xlator_t          *prev = trav;
+        xlator_list_t     *list         = this->children;
+        xlator_t          *trav         = list->xlator;
+        inode_table_t     *inode_table  = NULL;
+        xlator_t          *prev         = trav;
+        glusterfs_ctx_t   *ctx          = NULL;
+        xlator_list_t    **trav_p       = NULL;
+        xlator_t          *top          = NULL;
+        xlator_t          *victim       = NULL;
 
-        inode_table = this->itable;
+
+        if (this->call_cleanup)
+                return;
+
+        this->call_cleanup = 1;
+        ctx = glusterfsd_ctx;
 
         xlator_call_fini (trav);
 
@@ -238,6 +246,7 @@ xlator_mem_cleanup (xlator_t *this) {
                 prev = trav;
         }
 
+        inode_table = this->itable;
         if (inode_table) {
                 inode_table_destroy (inode_table);
                 this->itable = NULL;
@@ -249,6 +258,33 @@ xlator_mem_cleanup (xlator_t *this) {
 
         xlator_mem_free (this);
 
+        if (glusterfsd_ctx->active) {
+                top = glusterfsd_ctx->active->first;
+                LOCK (&ctx->volfile_lock);
+                /* TODO here we have leak for xlator node in a graph */
+                for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) {
+                        victim = (*trav_p)->xlator;
+                        if (victim->call_cleanup && !strcmp (victim->name, this->name)) {
+                                        (*trav_p) = (*trav_p)->next;
+                                        break;
+                        }
+                }
+                /* TODO Sometime brick xlator is not moved from graph so followed below
+                   approach to move brick xlator from a graph, will move specific brick
+                   xlator from graph only while inode table and mem_acct are cleaned up
+                */
+                trav_p = &top->children;
+                while (*trav_p) {
+                        victim = (*trav_p)->xlator;
+                        if (victim->call_cleanup && !victim->itable && !victim->mem_acct) {
+                                (*trav_p) = (*trav_p)->next;
+                        } else {
+                                trav_p = &(*trav_p)->next;
+                        }
+                }
+                UNLOCK (&ctx->volfile_lock);
+        }
+
 }
 
@@ -260,8 +296,10 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
         glusterfs_ctx_t         *ctx            = NULL;
         xlator_t                *top            = NULL;
         xlator_t                *victim         = NULL;
+        xlator_t                *tvictim        = NULL;
         xlator_list_t           **trav_p        = NULL;
         gf_boolean_t            lockflag        = _gf_false;
+        gf_boolean_t            last_brick      = _gf_false;
 
         ret = xdr_to_generic (req->msg[0], &xlator_req,
                               (xdrproc_t)xdr_gd1_mgmt_brick_op_req);
@@ -279,11 +317,15 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
                         for (trav_p = &top->children; *trav_p;
                                                     trav_p = &(*trav_p)->next) {
                                 victim = (*trav_p)->xlator;
-                                if (strcmp (victim->name, xlator_req.name) == 0) {
+                                if (!victim->cleanup_starting && strcmp (victim->name, xlator_req.name) == 0) {
                                         break;
                                 }
                         }
                 }
+
+                if (!top)
+                        goto err;
+
         }
         if (!*trav_p) {
                 gf_log (THIS->name, GF_LOG_ERROR,
@@ -299,26 +341,38 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
         }
 
         glusterfs_terminate_response_send (req, 0);
-        if ((trav_p == &top->children) && !(*trav_p)->next) {
+        for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) {
+                tvictim = (*trav_p)->xlator;
+                if (!tvictim->cleanup_starting && !strcmp (tvictim->name, xlator_req.name)) {
+                        continue;
+                }
+                if (!tvictim->cleanup_starting) {
+                        last_brick = _gf_true;
+                        break;
+                }
+        }
+        if (!last_brick) {
                 gf_log (THIS->name, GF_LOG_INFO,
                         "terminating after loss of last child %s",
                         xlator_req.name);
                 rpc_clnt_mgmt_pmap_signout (glusterfsd_ctx, xlator_req.name);
                 kill (getpid(), SIGTERM);
         } else {
-                /*
-                 * This is terribly unsafe without quiescing or shutting
-                 * things down properly but it gets us to the point
-                 * where we can test other stuff.
-                 *
-                 * TBD: finish implementing this "detach" code properly
-                 */
+                /* TODO cleanup sequence needs to be done properly for
+                   Quota and Changelog
+                */
+                if (victim->cleanup_starting)
+                        goto err;
+
+                victim->cleanup_starting = 1;
+
                 UNLOCK (&ctx->volfile_lock);
                 lockflag = _gf_true;
+
                 gf_log (THIS->name, GF_LOG_INFO, "detaching not-only"
                          " child %s", xlator_req.name);
                 top->notify (top, GF_EVENT_CLEANUP, victim);
-                xlator_mem_cleanup (victim);
+
         }
 err:
         if (!lockflag)
```
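
Read together, the new glusterfs_handle_terminate logic first checks whether any other live brick remains before choosing between process exit and an asynchronous detach. A condensed control-flow sketch of that decision (identifiers as in the hunk above; error handling and locking omitted):

```c
/* Condensed from the glusterfs_handle_terminate hunk above: scan the
 * graph for any other brick that is not already being torn down. */
for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) {
        tvictim = (*trav_p)->xlator;
        /* skip the brick we were asked to stop */
        if (!tvictim->cleanup_starting &&
            !strcmp (tvictim->name, xlator_req.name))
                continue;
        if (!tvictim->cleanup_starting) {
                last_brick = _gf_true;  /* another live brick remains */
                break;
        }
}

if (!last_brick) {
        /* no other live brick: sign out of the portmapper and exit */
        rpc_clnt_mgmt_pmap_signout (glusterfsd_ctx, xlator_req.name);
        kill (getpid (), SIGTERM);
} else {
        /* other bricks remain: mark the victim and kick off cleanup;
         * memory is freed later, from server_rpc_notify, once every
         * RPC client connection for this brick is gone */
        victim->cleanup_starting = 1;
        top->notify (top, GF_EVENT_CLEANUP, victim);
}
```

The key behavioral change is the removal of the synchronous xlator_mem_cleanup (victim) call after the notify: freeing the xlator is deferred until server_rpc_notify observes that all client connections for the brick are destroyed, which closes the race that crashed the brick process. (Note that the last_brick flag, despite its name, is set when some other live brick still exists.)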
