From 123db32a53f7e2f99c0d63b368ed8a8ee6b41f62 Mon Sep 17 00:00:00 2001 From: Raghavendra Bhat Date: Thu, 20 Mar 2014 14:46:02 +0530 Subject: mgmt/glusterd: do cleanup of snapshots in post-validate phase if half baked objects are there Change-Id: I372cac98ad054cdc1a6fbc7f6c77c25981063b2f Signed-off-by: Raghavendra Bhat Reviewed-on: http://review.gluster.org/7237 Reviewed-by: Rajesh Joseph Tested-by: Rajesh Joseph --- xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c | 4 +- xlators/mgmt/glusterd/src/glusterd-mgmt.c | 166 +++++++++++++++++----- xlators/mgmt/glusterd/src/glusterd-mgmt.h | 2 +- xlators/mgmt/glusterd/src/glusterd-snapshot.c | 147 ++++++++++++++++++- xlators/mgmt/glusterd/src/glusterd-store.c | 57 ++++++++ xlators/mgmt/glusterd/src/glusterd-utils.c | 1 + xlators/mgmt/glusterd/src/glusterd.h | 5 + 7 files changed, 344 insertions(+), 38 deletions(-) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c index 27d40b3a7..a2546ca94 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c @@ -659,8 +659,8 @@ glusterd_handle_post_validate_fn (rpcsvc_request_t *req) return -1; } - ret = gd_mgmt_v3_post_validate_fn (op_req.op, dict, &op_errstr, - rsp_dict); + ret = gd_mgmt_v3_post_validate_fn (op_req.op, op_req.op_ret, dict, + &op_errstr, rsp_dict); if (ret) { gf_log (this->name, GF_LOG_ERROR, diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c index cdc51849f..380f149a9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt.c +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c @@ -187,14 +187,35 @@ out: } int32_t -gd_mgmt_v3_post_validate_fn (glusterd_op_t op, dict_t *dict, +gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict, char **op_errstr, dict_t *rsp_dict) { int ret = -1; xlator_t *this = THIS; + GF_ASSERT (this); + + switch (op) { + case GD_OP_SNAP: 
+ { + ret = glusterd_snapshot_postvalidate (dict, op_ret, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "postvalidate operation failed"); + goto out; + } + break; + } + default: + break; + } + ret = 0; - gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret); + +out: + gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret); return ret; } @@ -211,9 +232,16 @@ gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov, int op_errno = -1; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); + /* Even though the lock command has failed, while collating the errors + (gd_mgmt_v3_collate_errors), args->op_ret and args->op_errno will be + used. @args is obtained from frame->local. So before checking the + status of the request and going out if its a failure, args should be + set to frame->local. Otherwise, while collating args will be NULL. + This applies to other phases such as prevalidate, brickop, commit and + postvalidate also. 
+ */ frame = myframe; args = frame->local; peerinfo = frame->cookie; @@ -225,6 +253,12 @@ gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov, goto out; } + if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); if (ret < 0) @@ -234,6 +268,7 @@ gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; + out: gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, GLUSTERD_MGMT_V3_LOCK, @@ -329,7 +364,7 @@ glusterd_mgmt_v3_initiate_lockdown (glusterd_conf_t *conf, glusterd_op_t op, gd_synctask_barrier_wait((&args), peer_cnt); if (args.errstr) - *op_errstr = gf_strdup (args.errstr); + *op_errstr = gf_strdup (args.errstr); ret = args.op_ret; @@ -395,7 +430,6 @@ gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, dict_t *rsp_dict = NULL; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); frame = myframe; @@ -409,6 +443,11 @@ gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, goto out; } + if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp); if (ret < 0) @@ -451,6 +490,7 @@ gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; } + out: if (rsp_dict) dict_unref (rsp_dict); @@ -641,7 +681,6 @@ gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov, int op_errno = -1; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); frame = myframe; @@ -650,11 +689,21 @@ gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov, frame->local = NULL; frame->cookie = NULL; + /* If the operation failed, then iov can be NULL. 
So better check the + status of the operation and then worry about iov (if the status of + the command is success) + */ if (-1 == req->rpc_status) { op_errno = ENOTCONN; goto out; } + if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp); if (ret < 0) @@ -664,10 +713,11 @@ gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; + out: gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, - GLUSTERD_MGMT_V3_BRICK_OP, - peerinfo, rsp.uuid); + GLUSTERD_MGMT_V3_BRICK_OP, + peerinfo, rsp.uuid); STACK_DESTROY (frame->root); synctask_barrier_wake(args); return 0; @@ -810,7 +860,6 @@ gd_mgmt_v3_commit_cbk_fn (struct rpc_req *req, struct iovec *iov, dict_t *rsp_dict = NULL; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); frame = myframe; @@ -824,6 +873,12 @@ gd_mgmt_v3_commit_cbk_fn (struct rpc_req *req, struct iovec *iov, goto out; } + if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp); if (ret < 0) @@ -1022,7 +1077,6 @@ gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, int op_errno = -1; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); frame = myframe; @@ -1036,6 +1090,12 @@ gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, goto out; } + if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp); if (ret < 0) @@ -1045,6 +1105,7 @@ gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; + out: gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, GLUSTERD_MGMT_V3_POST_VALIDATE, @@ -1063,10 +1124,10 @@ gd_mgmt_v3_post_validate_cbk 
(struct rpc_req *req, struct iovec *iov, } int -gd_mgmt_v3_post_validate (glusterd_op_t op, dict_t *op_ctx, - glusterd_peerinfo_t *peerinfo, - struct syncargs *args, uuid_t my_uuid, - uuid_t recv_uuid) +gd_mgmt_v3_post_validate (glusterd_op_t op, int32_t op_ret, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) { int ret = -1; gd1_mgmt_v3_post_val_req req = {{0},}; @@ -1084,6 +1145,7 @@ gd_mgmt_v3_post_validate (glusterd_op_t op, dict_t *op_ctx, uuid_copy (req.uuid, my_uuid); req.op = op; + req.op_ret = op_ret; synclock_unlock (&conf->big_lock); ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, &gd_mgmt_v3_prog, @@ -1098,7 +1160,8 @@ out: int glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op, - dict_t *req_dict, char **op_errstr, int npeers) + int32_t op_ret, dict_t *req_dict, + char **op_errstr, int npeers) { int ret = -1; int peer_cnt = 0; @@ -1120,8 +1183,8 @@ glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op, } /* Post Validation on local node */ - ret = gd_mgmt_v3_post_validate_fn (op, req_dict, op_errstr, - rsp_dict); + ret = gd_mgmt_v3_post_validate_fn (op, op_ret, req_dict, op_errstr, + rsp_dict); if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1155,8 +1218,8 @@ glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op, synctask_barrier_init((&args)); peer_cnt = 0; list_for_each_entry (peerinfo, peers, op_peers_list) { - gd_mgmt_v3_post_validate (op, req_dict, peerinfo, &args, - MY_UUID, peer_uuid); + gd_mgmt_v3_post_validate (op, op_ret, req_dict, peerinfo, &args, + MY_UUID, peer_uuid); peer_cnt++; } gd_synctask_barrier_wait((&args), peer_cnt); @@ -1190,7 +1253,6 @@ gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov, int op_errno = -1; GF_ASSERT(req); - GF_ASSERT(iov); GF_ASSERT(myframe); frame = myframe; @@ -1204,6 +1266,12 @@ gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov, goto out; } + 
if (!iov) { + gf_log (THIS->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); if (ret < 0) @@ -1213,6 +1281,7 @@ gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; + out: gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, GLUSTERD_MGMT_V3_UNLOCK, @@ -1413,7 +1482,11 @@ glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op, } /* POST-COMMIT VALIDATE PHASE */ - ret = glusterd_mgmt_v3_post_validate (conf, op, req_dict, + /* As of now, post_validate is not handling any other + commands other than snapshot. So as of now, I am + sending 0 (op_ret as 0). + */ + ret = glusterd_mgmt_v3_post_validate (conf, op, 0, req_dict, &op_errstr, npeers); if (ret) { gf_log ("", GF_LOG_ERROR, "Post Validation Failed"); @@ -1470,6 +1543,7 @@ glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op, uuid_t *originator_uuid = NULL; gf_boolean_t success = _gf_false; char *tmp_errstr = NULL; + int op_ret = -1; this = THIS; GF_ASSERT (this); @@ -1549,11 +1623,28 @@ glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op, ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict, &op_errstr, npeers); if (ret) { - gf_log ("", GF_LOG_ERROR, "Brick Ops Failed"); + gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed"); goto unbarrier; } /* COMMIT OP PHASE */ + /* TODO: As of now, the plan is to do quorum check before sending the + commit fop and if the quorum succeeds, then commit is sent to all + the other glusterds. + snap create functionality now creates the in memory and on disk + objects for the snapshot (marking them as incomplete), takes the lvm + snapshot and then updates the status of the in memory and on disk + snap objects as complete. 
Suppose one of the glusterds goes down + after taking the lvm snapshot, but before updating the snap object, + then treat it as a snapshot create failure and trigger cleanup. + i.e. the number of commit responses received by the originator + glusterd should be the same as the number of peers it has sent the + request to (i.e. npeers variable). If not, then originator glusterd + will initiate cleanup in post-validate fop. + Question: What if one of the other glusterds goes down as explained + above and along with it the originator glusterd also goes down? + Who will initiate the cleanup? + */ ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict, &op_errstr, npeers); if (ret) { @@ -1576,25 +1667,31 @@ unbarrier: goto out; ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict, &op_errstr, npeers); - if (ret || (success == _gf_false)) { - gf_log ("", GF_LOG_ERROR, "Brick Ops Failed"); - ret = -1; - goto out; - } - /* POST-COMMIT VALIDATE PHASE */ - ret = glusterd_mgmt_v3_post_validate (conf, op, req_dict, - &op_errstr, npeers); + if (ret) { - gf_log ("", GF_LOG_ERROR, "Post Validation Failed"); + gf_log ("", GF_LOG_ERROR, "Brick Ops Failed"); goto out; } ret = 0; + out: + op_ret = ret; + + if (success == _gf_false) + op_ret = -1; + + /* POST-COMMIT VALIDATE PHASE */ + ret = glusterd_mgmt_v3_post_validate (conf, op, op_ret, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed"); + op_ret = -1; + } /* UNLOCK PHASE FOR PEERS*/ (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict, - ret, &op_errstr, + op_ret, &op_errstr, npeers, is_acquired); /* If the commit op (snapshot taking) failed, then the error is stored @@ -1604,7 +1701,7 @@ out: is sent to cli. 
*/ if (tmp_errstr) { - if (ret && op_errstr) { + if (op_errstr) { gf_log (this->name, GF_LOG_ERROR, "unbarrier brick op" "failed with the error %s", op_errstr); GF_FREE (op_errstr); @@ -1614,7 +1711,8 @@ out: } /* SEND CLI RESPONSE */ - glusterd_op_send_cli_response (op, ret, 0, req, dict, op_errstr); + + glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr); /* LOCAL VOLUME(S) UNLOCK */ if (!is_acquired) diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h index 25ce32e01..685f59132 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mgmt.h +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h @@ -28,7 +28,7 @@ gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict, char **op_errstr, dict_t *rsp_dict); int32_t -gd_mgmt_v3_post_validate_fn (glusterd_op_t op, dict_t *dict, +gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict, char **op_errstr, dict_t *rsp_dict); int32_t diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c index db2d88831..5759bded4 100644 --- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c +++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c @@ -2685,7 +2685,12 @@ glusterd_create_snap_object (dict_t *dict) strcpy (snap->snapname, snapname); uuid_copy (snap->snap_id, *snap_id); snap->time_stamp = (time_t)time_stamp; - snap->snap_status = GD_SNAP_STATUS_IN_USE; + /* Set the status as GD_SNAP_STATUS_INIT and once the backend snapshot + is taken and snap is really ready to use, set the status to + GD_SNAP_STATUS_IN_USE. This helps in identifying the incomplete + snapshots and cleaning them up. 
+ */ + snap->snap_status = GD_SNAP_STATUS_INIT; if (description) { snap->description = gf_strdup (description); if (snap->description == NULL) { @@ -2705,8 +2710,10 @@ glusterd_create_snap_object (dict_t *dict) list_add_order (&snap->snap_list, &priv->snapshots, glusterd_compare_snap_time); + gf_log (this->name, GF_LOG_TRACE, "Snap %s added to the list", snap->snapname); + ret = 0; out: @@ -3557,6 +3564,61 @@ out: return ret; } +int32_t +glusterd_do_snap_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + char *name = NULL; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + glusterd_snap_t *snap = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); + + if (!dict || !op_errstr) { + gf_log (this->name, GF_LOG_ERROR, "input parameters NULL"); + goto out; + } + + ret = dict_get_str (dict, "snapname", &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "getting the snap " + "name failed (volume: %s)", volinfo->volname); + goto out; + } + + /* + If the snapname is not found that means the failure happened at + staging, or in commit, before the snap object is created, in which + case there is nothing to cleanup. So set ret to 0. 
+ */ + snap = glusterd_find_snap_by_name (name); + if (!snap) { + gf_log (this->name, GF_LOG_INFO, "snap %s is not found", name); + ret = 0; + goto out; + } + + ret = glusterd_snap_remove (snap, _gf_true, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "removing the snap %s failed", + name); + goto out; + } + + name = NULL; + + ret = 0; + +out: + + return ret; +} + int32_t glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr, dict_t *rsp_dict) @@ -3643,6 +3705,14 @@ glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr, } } + snap->snap_status = GD_SNAP_STATUS_IN_USE; + ret = glusterd_store_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Could not store snap" + "object %s", snap->snapname); + goto out; + } + ret = 0; out: @@ -4569,6 +4639,38 @@ out : return ret; } +int32_t +glusterd_snapshot_create_postvalidate (dict_t *dict, int32_t op_ret, + char **op_errstr, dict_t *rsp_dict) +{ + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + priv = this->private; + GF_ASSERT (priv); + + if (op_ret) { + ret = glusterd_do_snap_cleanup (dict, op_errstr, rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "cleanup operation " + "failed"); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + int32_t glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { @@ -4796,6 +4898,49 @@ out: return ret; } +int +glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, + dict_t *rsp_dict) +{ + int snap_command = 0; + xlator_t *this = NULL; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); //not sure if this is needed, verify. 
+ + ret = dict_get_int32 (dict, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case GF_SNAP_OPTION_TYPE_CREATE: + ret = glusterd_snapshot_create_postvalidate (dict, op_ret, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot create " + "post-validation failed"); + goto out; + } + break; + + default: + gf_log (this->name, GF_LOG_WARNING, "invalid snap command"); + goto out; + } + + ret = 0; +out: + return ret; +} + int glusterd_handle_snapshot_fn (rpcsvc_request_t *req) { diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index ae20b60db..613f03abc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -3030,6 +3030,35 @@ out: return ret; } +int32_t +glusterd_resolve_snap_bricks (xlator_t *this, glusterd_snap_t *snap) +{ + int32_t ret = -1; + glusterd_volinfo_t *volinfo = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + + GF_ASSERT (this); + GF_VALIDATE_OR_GOTO (this->name, snap, out); + + list_for_each_entry (volinfo, &snap->volumes, vol_list) { + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + ret = glusterd_resolve_brick (brickinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "resolve brick failed in restore"); + goto out; + } + } + } + + ret = 0; + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); + + return ret; +} + int glusterd_store_update_snap (glusterd_snap_t *snap) { @@ -3152,6 +3181,34 @@ glusterd_store_retrieve_snap (char *snapname) goto out; } + /* Unlike bricks of normal volumes which are resolved at the end of + the glusterd restore, the bricks belonging to the snap volumes of + each snap should be resolved as part of snapshot restore itself. 
Because if the snapshot has to be removed, then resolving bricks + helps glusterd understand which of the bricks have its own uuid, + so that it can kill those bricks. + */ + ret = glusterd_resolve_snap_bricks (this, snap); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "resolving the snap bricks" + " failed (snap: %s)", snap?snap->snapname:""); + + /* When the snapshot command from cli is received, the on disk and + in memory structures for the snapshot are created (with the status + being marked as GD_SNAP_STATUS_INIT). Once the backend snapshot is + taken, the status is changed to GD_SNAP_STATUS_IN_USE. If glusterd + dies after taking the backend snapshot, but before updating the + status, then when glusterd comes up, it should treat that snapshot + as a failed snapshot and clean it up. + */ + if (snap->snap_status != GD_SNAP_STATUS_IN_USE) { + ret = glusterd_snap_remove (snap, _gf_true, _gf_true); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "failed to remove" + " the snapshot %s", snap->snapname); + goto out; + } + /* TODO: list_add_order can do 'N-square' comparisions and is not efficient. 
Find a better solution to store the snap in order */ diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 90c624e2f..cd2fb806d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -5316,6 +5316,7 @@ glusterd_hostname_to_uuid (char *hostname, uuid_t uuid) uuid_copy (uuid, MY_UUID); ret = 0; } else { + ret = 0; goto out; } } else { diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 7e065b77b..e81369ee4 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -947,9 +947,14 @@ int glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict); int glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict); +int +glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, + dict_t *rsp_dict); char * glusterd_build_snap_device_path (char *device, char *snapname); int32_t glusterd_snap_remove (glusterd_snap_t *snap, gf_boolean_t remove_lvm, gf_boolean_t force); +int32_t +glusterd_snapshot_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict); #endif -- cgit