glusterd: Add multiple checks before attach/start a brick

Problem: In brick mux scenario sometime glusterd is not able to start/attach a brick and gluster v status shows brick is already running Solution: 1) To make sure brick is running check brick_path in /proc/<pid>/fd , if a brick is consumed by the brick process it means brick stack is come up otherwise not 2) Before start/attach a brick check if a brick is mounted or not 3) At the time of printing volume status check brick is consumed by any brick process Test: To test the same followed procedure 1) Setup brick mux environment on a vm 2) Put a breaking point in gdb in function posix_health_check_thread_proc at the time of notify GF_EVENT_CHILD_DOWN event 3) unmount anyone brick path forcefully 4) check gluster v status it will show N/A for the brick 5) Try to start volume with force option, glusterd throw message "No device available for mount brick" 6) Mount the brick_root path 7) Try to start volume with force option 8) down brick is started successfully Change-Id: I91898dad21d082ebddd12aa0d1f7f0ed012bdf69 fixes: bz#1595320 Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
author: Mohit Agrawal <moagrawal@redhat.com> 2018-07-12 13:29:48 +0530
committer: Atin Mukherjee <amukherj@redhat.com> 2018-07-27 01:24:09 +0000
commit: 9400b6f2c8aa219a493961e0ab9770b7f12e80d2 (patch)
tree: 50e31f0467b154d39f3d602e5a0f05d65d38a76c /xlators/mgmt
parent: 2836e158f38eb9ed070de88b64a3a8758cd2d4c0 (diff)
4 files changed, 226 insertions, 49 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
index ef3ca7014b1..a47b921b3c8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -2800,7 +2800,7 @@ glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv);
         if (gf_is_service_running (pidfile, &pid)) {
                 (void) send_attach_req (this, brickinfo->rpc,
-                                        brickinfo->path, NULL,
+                                        brickinfo->path, NULL, NULL,
                                         GLUSTERD_BRICK_TERMINATE);
                 brickinfo->status = GF_BRICK_STOPPED;
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index d791dc3b977..1d4de35fad6 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -2241,7 +2241,7 @@ retry:
                 goto out;
         }
 
-        ret = glusterd_brick_process_add_brick (brickinfo, volinfo);
+        ret = glusterd_brick_process_add_brick (brickinfo);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         GD_MSG_BRICKPROC_ADD_BRICK_FAILED, "Adding brick %s:%s "
@@ -2434,8 +2434,7 @@ out:
 }
 
 int
-glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo,
-                                  glusterd_volinfo_t *volinfo)
+glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo)
 {
         int                      ret = -1;
         xlator_t                *this = NULL;
@@ -2563,7 +2562,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
                                       brickinfo->hostname, brickinfo->path);
 
                         (void) send_attach_req (this, brickinfo->rpc,
-                                                brickinfo->path, NULL,
+                                                brickinfo->path, NULL, NULL,
                                                 GLUSTERD_BRICK_TERMINATE);
                 } else {
                         gf_msg_debug (this->name, 0, "About to stop glusterfsd"
@@ -5508,23 +5507,93 @@ static int32_t
 attach_brick_callback (struct rpc_req *req, struct iovec *iov, int count,
                        void *v_frame)
 {
-        call_frame_t    *frame  = v_frame;
-        glusterd_conf_t *conf   = frame->this->private;
-        glusterd_brickinfo_t *brickinfo = frame->local;
+        call_frame_t                     *frame     = v_frame;
+        glusterd_conf_t                  *conf      = frame->this->private;
+        glusterd_brickinfo_t             *brickinfo = frame->local;
+        glusterd_brickinfo_t             *other_brick = frame->cookie;
+        glusterd_volinfo_t               *volinfo   =  NULL;
+        xlator_t                         *this      = THIS;
+        int                               ret       = -1;
+        char                              pidfile1[PATH_MAX]      = {0};
+        char                              pidfile2[PATH_MAX]      = {0};
+        gf_getspec_rsp                    rsp   = {0,};
+        int                               last_brick = -1;
 
         frame->local = NULL;
-        brickinfo->port_registered = _gf_true;
+        frame->cookie = NULL;
+
+        ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
+        if (ret < 0) {
+                gf_log (frame->this->name, GF_LOG_ERROR, "XDR decoding error");
+                ret   = -1;
+                goto out;
+        }
+
+        ret =  glusterd_get_volinfo_from_brick (other_brick->path,
+                                                &volinfo);
+        if (ret) {
+                gf_msg (THIS->name, GF_LOG_ERROR, 0,
+                        GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
+                        " from brick(%s) so  pidfile copying/unlink will fail",
+                        other_brick->path);
+                goto out;
+        }
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile1, volinfo, other_brick, conf);
+        volinfo = NULL;
+
+        ret =  glusterd_get_volinfo_from_brick (brickinfo->path,
+                                                &volinfo);
+        if (ret) {
+                gf_msg (THIS->name, GF_LOG_ERROR, 0,
+                        GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
+                        " from brick(%s) so  pidfile copying/unlink will fail",
+                        brickinfo->path);
+                goto out;
+        }
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
+
+        if (rsp.op_ret == 0) {
+                brickinfo->port_registered = _gf_true;
+
+                /* PID file is copied once brick has attached
+                   successfully
+                */
+                glusterd_copy_file (pidfile1, pidfile2);
+                brickinfo->status = GF_BRICK_STARTED;
+                brickinfo->rpc = rpc_clnt_ref (other_brick->rpc);
+                gf_log (THIS->name, GF_LOG_INFO, "brick %s is attached successfully",
+                        brickinfo->path);
+        } else {
+                gf_log (THIS->name, GF_LOG_INFO, "attach_brick failed pidfile"
+                        " is %s for brick_path %s", pidfile2, brickinfo->path);
+                brickinfo->port = 0;
+                brickinfo->status = GF_BRICK_STOPPED;
+                ret = glusterd_brick_process_remove_brick (brickinfo, &last_brick);
+                if (ret)
+                        gf_msg_debug (this->name, 0, "Couldn't remove brick from"
+                                      " brick process");
+                LOCK (&volinfo->lock);
+                ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+                UNLOCK (&volinfo->lock);
+                if (ret) {
+                       gf_msg (this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_VOLINFO_SET_FAIL,
+                               "Failed to store volinfo of "
+                               "%s volume", volinfo->volname);
+                       goto out;
+                }
+        }
+out:
         synclock_lock (&conf->big_lock);
         --(conf->blockers);
         synclock_unlock (&conf->big_lock);
-
         STACK_DESTROY (frame->root);
         return 0;
 }
 
 int
 send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
-                 glusterd_brickinfo_t *brickinfo, int op)
+                 glusterd_brickinfo_t *brickinfo, glusterd_brickinfo_t *other_brick, int op)
 {
         int                             ret      = -1;
         struct iobuf                    *iobuf    = NULL;
@@ -5598,6 +5667,7 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
 
         if (op == GLUSTERD_BRICK_ATTACH) {
                 frame->local = brickinfo;
+                frame->cookie = other_brick;
                 cbkfn = attach_brick_callback;
         }
         /* Send the msg */
@@ -5671,27 +5741,19 @@ attach_brick (xlator_t *this,
                 rpc = rpc_clnt_ref (other_brick->rpc);
                 if (rpc) {
                         ret = send_attach_req (this, rpc, path, brickinfo,
+                                               other_brick,
                                                GLUSTERD_BRICK_ATTACH);
                         rpc_clnt_unref (rpc);
                         if (!ret) {
                                 ret = pmap_registry_extend (this, other_brick->port,
-                                                            brickinfo->path);
+                                            brickinfo->path);
                                 if (ret != 0) {
                                         gf_log (this->name, GF_LOG_ERROR,
                                                 "adding brick to process failed");
-                                        return ret;
+                                        goto out;
                                 }
-
-                                /* PID file is copied once brick has attached
-                                  successfully
-                                */
-                                glusterd_copy_file (pidfile1, pidfile2);
                                 brickinfo->port = other_brick->port;
-                                brickinfo->status = GF_BRICK_STARTED;
-                                brickinfo->rpc =
-                                        rpc_clnt_ref (other_brick->rpc);
-                                ret = glusterd_brick_process_add_brick (brickinfo,
-                                                                        volinfo);
+                                ret = glusterd_brick_process_add_brick (brickinfo);
                                 if (ret) {
                                         gf_msg (this->name, GF_LOG_ERROR, 0,
                                                 GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
@@ -5700,23 +5762,17 @@ attach_brick (xlator_t *this,
                                                 brickinfo->path);
                                         return ret;
                                 }
-
-                                if (ret) {
-                                        gf_msg_debug (this->name, 0, "Add brick"
-                                                    " to brick process failed");
-                                        return ret;
-                                }
-
                                 return 0;
                         }
                 }
                 /*
-                 * It might not actually be safe to manipulate the lock like
-                 * this, but if we don't then the connection can never actually
-                 * complete and retries are useless.  Unfortunately, all of the
-                 * alternatives (e.g. doing all of this in a separate thread)
-                 * are much more complicated and risky.  TBD: see if there's a
-                 * better way
+                 * It might not actually be safe to manipulate the lock
+                 * like this, but if we don't then the connection can
+                 * never actually complete and retries are useless.
+                 * Unfortunately, all of the alternatives (e.g. doing
+                 * all of this in a separate thread) are much more
+                 * complicated and risky.
+                 * TBD: see if there's a better way
                  */
                 synclock_unlock (&conf->big_lock);
                 sleep (1);
@@ -5945,6 +6001,7 @@ find_compatible_brick (glusterd_conf_t *conf,
         return NULL;
 }
 
+
 /* Below function is use to populate sockpath based on passed pid
    value as a argument after check the value from proc and also
    check if passed pid is match with running  glusterfs process
@@ -6031,6 +6088,62 @@ glusterd_get_sock_from_brick_pid (int pid, char *sockpath, size_t len)
 }
 
 
+char *
+search_brick_path_from_proc (pid_t brick_pid, char *brickpath)
+{
+        struct dirent *dp = NULL;
+        DIR *dirp = NULL;
+        size_t len = 0;
+        int fd = -1;
+        char path[PATH_MAX] = {0,};
+        char sym[PATH_MAX] = {0,};
+        struct dirent  scratch[2] = {{0,},};
+        char *brick_path = NULL;
+
+        if (!brickpath)
+                goto out;
+
+        sprintf(path, "/proc/%d/fd/", brick_pid);
+        dirp = sys_opendir (path);
+        if (!dirp)
+                goto out;
+
+        len = strlen (path);
+        if (len >= (sizeof(path) - 2))
+                goto out;
+
+        fd = dirfd (dirp);
+        if (fd  < 0)
+                goto out;
+
+        memset(path, 0, sizeof(path));
+        memset(sym, 0, sizeof(sym));
+
+        while ((dp = sys_readdir(dirp, scratch))) {
+                if (!strcmp(dp->d_name, ".") ||
+                    !strcmp(dp->d_name, ".."))
+                        continue;
+
+                /* check for non numerical descriptors */
+                if (!strtol(dp->d_name, (char **)NULL, 10))
+                        continue;
+
+                len = readlinkat (fd, dp->d_name, sym, sizeof(sym) - 1);
+                if (len > 1) {
+                        sym[len] = '\0';
+                        if (!strcmp (sym, brickpath)) {
+                                brick_path = gf_strdup(sym);
+                                break;
+                        }
+                        memset (sym, 0, sizeof (sym));
+                }
+        }
+out:
+        sys_closedir(dirp);
+        return brick_path;
+}
+
+
 int
 glusterd_brick_start (glusterd_volinfo_t *volinfo,
                       glusterd_brickinfo_t *brickinfo,
@@ -6044,7 +6157,9 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
         int32_t                 pid                   = -1;
         char                    pidfile[PATH_MAX]     = {0};
         char                    socketpath[PATH_MAX]  = {0};
+        char                    *brickpath            = NULL;
         glusterd_volinfo_t      *other_vol;
+        struct statvfs           brickstat = {0,};
 
         this = THIS;
         GF_ASSERT (this);
@@ -6090,6 +6205,28 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                 brickinfo->start_triggered = _gf_true;
 
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
+
+        ret = sys_statvfs (brickinfo->path, &brickstat);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR,
+                        errno, GD_MSG_BRICKINFO_CREATE_FAIL,
+                        "failed to get statfs() call on brick %s",
+                        brickinfo->path);
+                goto out;
+        }
+
+        /* Compare fsid is helpful to ensure the existence of a brick_root
+           path before the start/attach a brick
+        */
+        if (brickinfo->statfs_fsid &&
+            (brickinfo->statfs_fsid != brickstat.f_fsid)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fsid comparison is failed it means Brick root path"
+                        " %s is not created by glusterd, start/attach will also fail",
+                        brickinfo->path);
+                goto out;
+        }
+
         if (gf_is_service_running (pidfile, &pid)) {
                 if (brickinfo->status != GF_BRICK_STARTING &&
                     brickinfo->status != GF_BRICK_STARTED) {
@@ -6109,12 +6246,29 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                          * TBD: re-use RPC connection across bricks
                          */
                         if (is_brick_mx_enabled ()) {
+                                brickpath = search_brick_path_from_proc (pid, brickinfo->path);
+                                if (!brickpath) {
+                                        gf_log (this->name, GF_LOG_INFO,
+                                                "Either pid %d is not running or brick"
+                                                " path %s is not consumed so cleanup pidfile",
+                                                pid, brickinfo->path);
+                                        /* search brick is failed so unlink pidfile */
+                                        if (sys_access (pidfile , R_OK) == 0) {
+                                                sys_unlink (pidfile);
+                                        }
+                                        goto run;
+                                }
+                                GF_FREE (brickpath);
                                 ret = glusterd_get_sock_from_brick_pid (pid, socketpath,
                                                                         sizeof(socketpath));
                                 if (ret) {
-                                        gf_log (this->name, GF_LOG_DEBUG,
+                                        gf_log (this->name, GF_LOG_INFO,
                                                 "Either pid %d is not running or is not match"
                                                 " with any running brick process ", pid);
+                                        /* Fetch unix socket is failed so unlink pidfile */
+                                        if (sys_access (pidfile , R_OK) == 0) {
+                                                sys_unlink (pidfile);
+                                        }
                                         goto run;
                                 }
                         } else {
@@ -6129,7 +6283,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                         (void) glusterd_brick_connect (volinfo, brickinfo,
                                         socketpath);
 
-                        ret = glusterd_brick_process_add_brick (brickinfo, volinfo);
+                        ret = glusterd_brick_process_add_brick (brickinfo);
                         if (ret) {
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
                                         GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
@@ -6169,6 +6323,10 @@ run:
                 if (ret == 0) {
                         goto out;
                 }
+                /* Attach_brick is failed so unlink pidfile */
+                if (sys_access (pidfile , R_OK) == 0) {
+                        sys_unlink (pidfile);
+                }
         }
 
         /*
@@ -7137,14 +7295,15 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
                             dict_t  *dict, int32_t count)
 {
 
-        int             ret                   = -1;
-        int32_t         pid                   = -1;
-        char            key[1024 + 16]        = {0};
-        char            base_key[1024]        = {0};
-        char            pidfile[PATH_MAX]     = {0};
+        int              ret                  = -1;
+        int32_t          pid                  = -1;
+        char             key[1024]            = {0};
+        char             base_key[1024]       = {0};
+        char             pidfile[PATH_MAX]    = {0};
         xlator_t        *this                 = NULL;
         glusterd_conf_t *priv                 = NULL;
-        gf_boolean_t    brick_online          = _gf_false;
+        gf_boolean_t     brick_online         = _gf_false;
+        char            *brickpath            = NULL;
 
         GF_ASSERT (volinfo);
         GF_ASSERT (brickinfo);
@@ -7201,7 +7360,20 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
         if (glusterd_is_brick_started (brickinfo)) {
                 if (gf_is_service_running (pidfile, &pid) &&
                     brickinfo->port_registered) {
-                        brick_online = _gf_true;
+                        if (!is_brick_mx_enabled ()) {
+                                brick_online = _gf_true;
+                        } else {
+                                brickpath = search_brick_path_from_proc (pid, brickinfo->path);
+                                if (!brickpath) {
+                                        gf_log (this->name, GF_LOG_INFO,
+                                                "brick path %s is not consumed",
+                                                brickinfo->path);
+                                        brick_online = _gf_false;
+                                } else {
+                                        brick_online = _gf_true;
+                                        GF_FREE (brickpath);
+                                }
+                        }
                 } else {
                         pid = -1;
                 }
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index 51e7c43d781..eded8e231d4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -178,8 +178,7 @@ int32_t
 glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo);
 
 int
-glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo,
-                                  glusterd_volinfo_t *volinfo);
+glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo);
 
 int
 glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo,
@@ -200,7 +199,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
 
 int
 send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
-                 glusterd_brickinfo_t *brick, int op);
+                 glusterd_brickinfo_t *brick,
+                 glusterd_brickinfo_t *other_brick, int op);
 
 glusterd_volinfo_t *
 glusterd_volinfo_ref (glusterd_volinfo_t *volinfo);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index b6cef79df7f..4c1f0be3d06 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -2608,8 +2608,13 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
         }
 
         glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STARTED);
-
+        /* Update volinfo on disk in critical section because
+           attach_brick_callback can also call store_volinfo for same
+           volume to update volinfo on disk
+        */
+        LOCK (&volinfo->lock);
         ret = glusterd_store_volinfo (volinfo, verincrement);
+        UNLOCK (&volinfo->lock);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         GD_MSG_VOLINFO_SET_FAIL,
author	Mohit Agrawal <moagrawal@redhat.com>	2018-07-12 13:29:48 +0530
committer	Atin Mukherjee <amukherj@redhat.com>	2018-07-27 01:24:09 +0000
commit	9400b6f2c8aa219a493961e0ab9770b7f12e80d2 (patch)
tree	50e31f0467b154d39f3d602e5a0f05d65d38a76c /xlators/mgmt
parent	2836e158f38eb9ed070de88b64a3a8758cd2d4c0 (diff)