1 files changed, 1282 insertions, 530 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index 20142a1cc4b..458bf168ede 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -1,670 +1,1422 @@
 /*
-  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
 
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
 #include <inttypes.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/resource.h>
 #include <sys/statvfs.h>
 
-#include "globals.h"
-#include "compat.h"
+#include <glusterfs/compat.h>
 #include "protocol-common.h"
-#include "xlator.h"
-#include "logging.h"
-#include "timer.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/timer.h>
 #include "glusterd-mem-types.h"
 #include "glusterd.h"
 #include "glusterd-sm.h"
 #include "glusterd-op-sm.h"
 #include "glusterd-utils.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-messages.h"
 #include "glusterd-store.h"
-#include "run.h"
+#include <glusterfs/run.h>
 #include "glusterd-volgen.h"
+#include "glusterd-messages.h"
 
-#include "syscall.h"
+#include <glusterfs/syscall.h>
 #include "cli1-xdr.h"
 #include "xdr-generic.h"
 
-int32_t
-glusterd3_1_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
-                          int count, void *myframe);
-
-void
-glusterd_rebalance_cmd_attempted_log (int cmd, char *volname)
-{
-        switch (cmd) {
-                case GF_DEFRAG_CMD_START_LAYOUT_FIX:
-                        gf_cmd_log ("Volume rebalance"," on volname: %s "
-                                    "cmd: start fix layout , attempted",
-                                    volname);
-                        gf_log ("glusterd", GF_LOG_INFO, "Received rebalance "
-                                "volume start layout fix on %s", volname);
-                        break;
-                case GF_DEFRAG_CMD_START_FORCE:
-                        gf_cmd_log ("Volume rebalance"," on volname: %s "
-                                    "cmd: start data force attempted",
-                                    volname);
-                        gf_log ("glusterd", GF_LOG_INFO, "Received rebalance "
-                                "volume start migrate data on %s", volname);
-                        break;
-                case GF_DEFRAG_CMD_START:
-                        gf_cmd_log ("Volume rebalance"," on volname: %s "
-                                    "cmd: start, attempted", volname);
-                        gf_log ("glusterd", GF_LOG_INFO, "Received rebalance "
-                                "volume start on %s", volname);
-                        break;
-                case GF_DEFRAG_CMD_STOP:
-                        gf_cmd_log ("Volume rebalance"," on volname: %s "
-                                    "cmd: stop, attempted", volname);
-                        gf_log ("glusterd", GF_LOG_INFO, "Received rebalance "
-                                "volume stop on %s", volname);
-                        break;
-                default:
-                        break;
-        }
-}
-
-void
-glusterd_rebalance_cmd_log (int cmd, char *volname, int status)
-{
-        if (cmd != GF_DEFRAG_CMD_STATUS) {
-                gf_cmd_log ("volume rebalance"," on volname: %s %d %s",
-                            volname, cmd, ((status)?"FAILED":"SUCCESS"));
-        }
-}
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo)                           \
+    do {                                                                       \
+        int32_t _defrag_sockfile_len;                                          \
+        char tmppath[PATH_MAX] = {                                             \
+            0,                                                                 \
+        };                                                                     \
+        _defrag_sockfile_len = snprintf(                                       \
+            tmppath, PATH_MAX,                                                 \
+            DEFAULT_VAR_RUN_DIRECTORY "/gluster-%s-%s-%s.sock", "rebalance",   \
+            volinfo->volname, uuid_utoa(MY_UUID));                             \
+        if ((_defrag_sockfile_len < 0) ||                                      \
+            (_defrag_sockfile_len >= PATH_MAX)) {                              \
+            path[0] = 0;                                                       \
+        } else {                                                               \
+            glusterd_set_socket_filepath(tmppath, path, sizeof(path));         \
+        }                                                                      \
+    } while (0)
 
+int32_t
+glusterd_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe);
 int
-glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
-                                size_t len)
+glusterd_defrag_start_validate(glusterd_volinfo_t *volinfo, char *op_errstr,
+                               size_t len, glusterd_op_t op)
 {
-        int     ret = -1;
-
-        if (glusterd_is_defrag_on (volinfo)) {
-                gf_log ("glusterd", GF_LOG_DEBUG,
-                        "rebalance on volume %s already started",
-                        volinfo->volname);
-                snprintf (op_errstr, len, "Rebalance on %s is already started",
-                          volinfo->volname);
-                goto out;
-        }
-
-        if (glusterd_is_rb_started (volinfo) ||
-            glusterd_is_rb_paused (volinfo)) {
-                gf_log ("glusterd", GF_LOG_DEBUG,
-                        "Rebalance failed as replace brick is in progress on volume %s",
-                        volinfo->volname);
-                snprintf (op_errstr, len, "Rebalance failed as replace brick is in progress on "
-                          "volume %s", volinfo->volname);
-                goto out;
-        }
-        ret = 0;
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* Check only if operation is not remove-brick */
+    if ((GD_OP_REMOVE_BRICK != op) && !gd_is_remove_brick_committed(volinfo)) {
+        gf_msg_debug(this->name, 0,
+                     "A remove-brick task on "
+                     "volume %s is not yet committed",
+                     volinfo->volname);
+        snprintf(op_errstr, len,
+                 "A remove-brick task on volume %s is"
+                 " not yet committed. Either commit or stop the "
+                 "remove-brick task.",
+                 volinfo->volname);
+        goto out;
+    }
+
+    if (glusterd_is_defrag_on(volinfo)) {
+        gf_msg_debug(this->name, 0, "rebalance on volume %s already started",
+                     volinfo->volname);
+        snprintf(op_errstr, len, "Rebalance on %s is already started",
+                 volinfo->volname);
+        goto out;
+    }
+
+    ret = 0;
 out:
-        gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
-        return ret;
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
 }
 
 int32_t
-glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
-                        rpc_clnt_event_t event, void *data)
+__glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata,
+                         rpc_clnt_event_t event, void *data)
 {
-        glusterd_volinfo_t      *volinfo = NULL;
-        glusterd_defrag_info_t  *defrag  = NULL;
-        int                     ret      = 0;
-        char                    pidfile[PATH_MAX];
-        glusterd_conf_t        *priv    = NULL;
-
-        priv = THIS->private;
-        if (!priv)
-                return 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_defrag_info_t *defrag = NULL;
+    int ret = 0;
+    char pidfile[PATH_MAX];
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    int pid = -1;
+
+    this = THIS;
+    if (!this)
+        return 0;
 
-        volinfo = mydata;
-        if (!volinfo)
-                return 0;
+    priv = this->private;
+    if (!priv)
+        return 0;
 
-        defrag = volinfo->defrag;
-        if (!defrag)
-                return 0;
+    volinfo = mydata;
+    if (!volinfo)
+        return 0;
 
-        if ((event == RPC_CLNT_DISCONNECT) && defrag->connected)
-                volinfo->defrag = NULL;
+    defrag = volinfo->rebal.defrag;
+    if (!defrag)
+        return 0;
 
-        GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+    if ((event == RPC_CLNT_DISCONNECT) && defrag->connected)
+        volinfo->rebal.defrag = NULL;
 
-        switch (event) {
-        case RPC_CLNT_CONNECT:
-        {
-                if (defrag->connected)
-                        return 0;
+    GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
 
-                LOCK (&defrag->lock);
-                {
-                        defrag->connected = 1;
-                }
-                UNLOCK (&defrag->lock);
+    switch (event) {
+        case RPC_CLNT_CONNECT: {
+            if (defrag->connected)
+                return 0;
 
-               gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_CONNECT",
-                        rpc->conn.trans->name);
-               break;
+            LOCK(&defrag->lock);
+            {
+                defrag->connected = 1;
+            }
+            UNLOCK(&defrag->lock);
+
+            gf_msg_debug(this->name, 0, "%s got RPC_CLNT_CONNECT",
+                         rpc->conn.name);
+            break;
         }
 
-        case RPC_CLNT_DISCONNECT:
-        {
-                if (!defrag->connected)
-                        return 0;
+        case RPC_CLNT_DISCONNECT: {
+            if (!defrag->connected)
+                return 0;
 
-                LOCK (&defrag->lock);
-                {
-                        defrag->connected = 0;
-                }
-                UNLOCK (&defrag->lock);
-
-                if (!glusterd_is_service_running (pidfile, NULL)) {
-                        if (volinfo->defrag_status ==
-                                                     GF_DEFRAG_STATUS_STARTED) {
-                                volinfo->defrag_status =
-                                                        GF_DEFRAG_STATUS_FAILED;
-                        } else {
-                                volinfo->defrag_cmd = 0;
-                        }
-                 }
-
-                glusterd_store_perform_node_state_store (volinfo);
-
-                if (defrag->rpc) {
-                        rpc_clnt_unref (defrag->rpc);
-                        defrag->rpc = NULL;
+            LOCK(&defrag->lock);
+            {
+                defrag->connected = 0;
+            }
+            UNLOCK(&defrag->lock);
+
+            if (!gf_is_service_running(pidfile, &pid)) {
+                if (volinfo->rebal.defrag_status == GF_DEFRAG_STATUS_STARTED) {
+                    volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_FAILED;
                 }
-                if (defrag->cbk_fn)
-                        defrag->cbk_fn (volinfo, volinfo->defrag_status);
+            }
 
-                if (defrag)
-                        GF_FREE (defrag);
-                gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_DISCONNECT",
-                        rpc->conn.trans->name);
-                break;
+            glusterd_store_perform_node_state_store(volinfo);
+
+            rpc_clnt_disable(defrag->rpc);
+            glusterd_defrag_rpc_put(defrag);
+            if (defrag->cbk_fn)
+                defrag->cbk_fn(volinfo, volinfo->rebal.defrag_status);
+
+            GF_FREE(defrag);
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_REBALANCE_DISCONNECTED,
+                   "Rebalance process for volume %s has disconnected.",
+                   volinfo->volname);
+            break;
         }
+        case RPC_CLNT_DESTROY:
+            glusterd_volinfo_unref(volinfo);
+            break;
         default:
-                gf_log ("", GF_LOG_TRACE,
-                        "got some other RPC event %d", event);
-                ret = 0;
-                break;
-        }
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            ret = 0;
+            break;
+    }
 
-        return ret;
+    return ret;
 }
 
-int
-glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
-                              size_t len, int cmd, defrag_cbk_fn_t cbk)
+int32_t
+glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata,
+                       rpc_clnt_event_t event, void *data)
 {
-        int                    ret = -1;
-        glusterd_defrag_info_t *defrag =  NULL;
-        runner_t               runner = {0,};
-        glusterd_conf_t        *priv = NULL;
-        char                   defrag_path[PATH_MAX];
-        char                   sockfile[PATH_MAX] = {0,};
-        char                   pidfile[PATH_MAX] = {0,};
-        char                   logfile[PATH_MAX] = {0,};
-        dict_t                 *options = NULL;
-#ifdef DEBUG
-        char                   valgrind_logfile[PATH_MAX] = {0,};
-#endif
-        priv    = THIS->private;
-
-        GF_ASSERT (volinfo);
-        GF_ASSERT (op_errstr);
-
-        ret = glusterd_defrag_start_validate (volinfo, op_errstr, len);
-        if (ret)
-                goto out;
-        if (!volinfo->defrag)
-                volinfo->defrag = GF_CALLOC (1, sizeof (glusterd_defrag_info_t),
-                                             gf_gld_mt_defrag_info);
-        if (!volinfo->defrag)
-                goto out;
-
-        defrag = volinfo->defrag;
-
-        defrag->cmd = cmd;
+    return glusterd_big_locked_notify(rpc, mydata, event, data,
+                                      __glusterd_defrag_notify);
+}
 
-        LOCK_INIT (&defrag->lock);
+int
+glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr,
+                             size_t len, int cmd, defrag_cbk_fn_t cbk,
+                             glusterd_op_t op)
+{
+    xlator_t *this = NULL;
+    int ret = -1;
+    glusterd_defrag_info_t *defrag = NULL;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    char defrag_path[PATH_MAX];
+    char sockfile[PATH_MAX] = {
+        0,
+    };
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char volname[PATH_MAX] = {
+        0,
+    };
+    char valgrind_logfile[PATH_MAX] = {
+        0,
+    };
+    char msg[1024] = {
+        0,
+    };
+    char *volfileserver = NULL;
+    char *localtime_logging = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", priv, out);
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(op_errstr);
+
+    ret = glusterd_defrag_start_validate(volinfo, op_errstr, len, op);
+    if (ret)
+        goto out;
+    if (!volinfo->rebal.defrag)
+        volinfo->rebal.defrag = GF_CALLOC(1, sizeof(*volinfo->rebal.defrag),
+                                          gf_gld_mt_defrag_info);
+    if (!volinfo->rebal.defrag)
+        goto out;
+
+    defrag = volinfo->rebal.defrag;
+
+    defrag->cmd = cmd;
+
+    volinfo->rebal.defrag_cmd = cmd;
+    volinfo->rebal.op = op;
+
+    LOCK_INIT(&defrag->lock);
+
+    volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+    glusterd_volinfo_reset_defrag_stats(volinfo);
+    glusterd_store_perform_node_state_store(volinfo);
+
+    GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv);
+    ret = mkdir_p(defrag_path, 0755, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create "
+               "directory %s",
+               defrag_path);
+        goto out;
+    }
+
+    GLUSTERD_GET_DEFRAG_SOCK_FILE(sockfile, volinfo);
+    GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+    snprintf(logfile, PATH_MAX, "%s/%s-%s.log", priv->logdir, volinfo->volname,
+             "rebalance");
+    runinit(&runner);
+
+    if (this->ctx->cmd_args.vgtool != _gf_none) {
+        snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s-rebalance.log",
+                 priv->logdir, volinfo->volname);
+
+        if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+            runner_add_args(&runner, "valgrind", "--leak-check=full",
+                            "--trace-children=yes", "--track-origins=yes",
+                            NULL);
+        else
+            runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+        runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+    }
+
+    snprintf(volname, sizeof(volname), "rebalance/%s", volinfo->volname);
+
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0) {
+        volfileserver = "localhost";
+    }
+
+    runner_add_args(
+        &runner, SBIN_DIR "/glusterfs", "-s", volfileserver, "--volfile-id",
+        volname, "--xlator-option", "*dht.use-readdirp=yes", "--xlator-option",
+        "*dht.lookup-unhashed=yes", "--xlator-option",
+        "*dht.assert-no-child-down=yes", "--xlator-option",
+        "*dht.readdir-optimize=on", "--process-name", "rebalance", NULL);
+
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.rebalance-cmd=%d", cmd);
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID));
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.commit-hash=%u",
+                     volinfo->rebal.commit_hash);
+    runner_add_arg(&runner, "--socket-file");
+    runner_argprintf(&runner, "%s", sockfile);
+    runner_add_arg(&runner, "--pid-file");
+    runner_argprintf(&runner, "%s", pidfile);
+    runner_add_arg(&runner, "-l");
+    runner_argprintf(&runner, "%s", logfile);
+    if (volinfo->memory_accounting)
+        runner_add_arg(&runner, "--mem-accounting");
+    if (dict_get_strn(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                      SLEN(GLUSTERD_LOCALTIME_LOGGING_KEY),
+                      &localtime_logging) == 0) {
+        if (strcmp(localtime_logging, "enable") == 0)
+            runner_add_arg(&runner, "--localtime-logging");
+    }
+
+    snprintf(msg, sizeof(msg), "Starting the rebalance service for volume %s",
+             volinfo->volname);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    ret = runner_run_nowait(&runner);
+    if (ret) {
+        gf_msg_debug("glusterd", 0, "rebalance command failed");
+        goto out;
+    }
+
+    sleep(5);
+
+    ret = glusterd_rebalance_rpc_create(volinfo);
+
+    // FIXME: this cbk is passed as NULL in all occurrences. Maybe
+    // we never needed it.
+    if (cbk)
+        defrag->cbk_fn = cbk;
 
-        volinfo->defrag_status = GF_DEFRAG_STATUS_STARTED;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
 
-        volinfo->rebalance_files = 0;
-        volinfo->rebalance_data = 0;
-        volinfo->lookedup_files = 0;
-        volinfo->rebalance_failures = 0;
-        volinfo->rebalance_time = 0;
+int
+glusterd_rebalance_defrag_init(glusterd_volinfo_t *volinfo, defrag_cbk_fn_t cbk)
 
-        volinfo->defrag_cmd = cmd;
-        glusterd_store_perform_node_state_store (volinfo);
+{
+    glusterd_defrag_info_t *defrag = NULL;
+    int ret = -1;
+
+    if (!volinfo->rebal.defrag) {
+        volinfo->rebal.defrag = GF_CALLOC(1, sizeof(*volinfo->rebal.defrag),
+                                          gf_gld_mt_defrag_info);
+    } else {
+        /*
+         * if defrag variable is already initialized,
+         * we skip the initialization.
+         */
+        ret = 0;
+        goto out;
+    }
+
+    if (!volinfo->rebal.defrag)
+        goto out;
+    defrag = volinfo->rebal.defrag;
+
+    defrag->cmd = volinfo->rebal.defrag_cmd;
+    LOCK_INIT(&defrag->lock);
+    if (cbk)
+        defrag->cbk_fn = cbk;
+    ret = 0;
+out:
+    return ret;
+}
 
-        GLUSTERD_GET_DEFRAG_DIR (defrag_path, volinfo, priv);
-        ret = mkdir_p (defrag_path, 0777, _gf_true);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR, "Failed to create "
-                        "directory %s", defrag_path);
-                goto out;
-        }
+int
+glusterd_rebalance_rpc_create(glusterd_volinfo_t *volinfo)
+{
+    dict_t *options = NULL;
+    char sockfile[PATH_MAX] = {
+        0,
+    };
+    int ret = -1;
+    glusterd_defrag_info_t *defrag = volinfo->rebal.defrag;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    // rebalance process is not started
+    if (!defrag)
+        goto out;
+
+    options = dict_new();
+    if (!options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    GLUSTERD_GET_DEFRAG_SOCK_FILE(sockfile, volinfo);
+
+    /* Setting frame-timeout to 10 mins (600 seconds).
+     * Unix domain sockets ensure that the connection is reliable. The
+     * default timeout of 30 mins used for unreliable network connections is
+     * too long for unix domain socket connections.
+     */
+    ret = rpc_transport_unix_options_build(options, sockfile, 600);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_UNIX_OP_BUILD_FAIL,
+               "Unix options build failed");
+        goto out;
+    }
+
+    glusterd_volinfo_ref(volinfo);
+    ret = glusterd_rpc_create(&defrag->rpc, options, glusterd_defrag_notify,
+                              volinfo, _gf_true);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
+               "Glusterd RPC creation failed");
+        goto out;
+    }
+    ret = 0;
+out:
+    if (options)
+        dict_unref(options);
+    return ret;
+}
 
-        GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
-        GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, priv);
-        snprintf (logfile, PATH_MAX, "%s/%s-rebalance.log",
-                    DEFAULT_LOG_FILE_DIRECTORY, volinfo->volname);
-        runinit (&runner);
-#ifdef DEBUG
-        if (priv->valgrind) {
-                snprintf (valgrind_logfile, PATH_MAX,
-                          "%s/valgrind-%s-rebalance.log",
-                          DEFAULT_LOG_FILE_DIRECTORY,
-                          volinfo->volname);
-
-                runner_add_args (&runner, "valgrind", "--leak-check=full",
-                                 "--trace-children=yes", NULL);
-                runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
-        }
-#endif
-
-        runner_add_args (&runner, SBIN_DIR"/glusterfs",
-                         "-s", "localhost", "--volfile-id", volinfo->volname,
-                         "--xlator-option", "*dht.use-readdirp=yes",
-                         "--xlator-option", "*dht.lookup-unhashed=yes",
-                         "--xlator-option", "*dht.assert-no-child-down=yes",
-                         "--xlator-option", "*replicate*.data-self-heal=off",
-                         "--xlator-option",
-                         "*replicate*.metadata-self-heal=off",
-                         "--xlator-option", "*replicate*.entry-self-heal=off",
-                         NULL);
-        runner_add_arg (&runner, "--xlator-option");
-        runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd);
-        runner_add_arg (&runner, "--xlator-option");
-        runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(priv->uuid));
-        runner_add_arg (&runner, "--socket-file");
-        runner_argprintf (&runner, "%s",sockfile);
-        runner_add_arg (&runner, "--pid-file");
-        runner_argprintf (&runner, "%s",pidfile);
-        runner_add_arg (&runner, "-l");
-        runner_argprintf (&runner, logfile);
-        if (volinfo->memory_accounting)
-                runner_add_arg (&runner, "--mem-accounting");
-
-        ret = runner_run_reuse (&runner);
-        if (ret) {
-                runner_log (&runner, "glusterd", GF_LOG_DEBUG, "command failed");
-                runner_end (&runner);
-                goto out;
-        }
+int
+glusterd_rebalance_cmd_validate(int cmd, char *volname,
+                                glusterd_volinfo_t **volinfo, char *op_errstr,
+                                size_t len)
+{
+    int ret = -1;
+
+    if (glusterd_volinfo_find(volname, volinfo)) {
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Received rebalance on invalid"
+               " volname %s",
+               volname);
+        snprintf(op_errstr, len, "Volume %s does not exist", volname);
+        goto out;
+    }
+    if ((*volinfo)->brick_count <= (*volinfo)->dist_leaf_count) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_DISTRIBUTE,
+               "Volume %s is not a "
+               "distribute type or contains only 1 brick",
+               volname);
+        snprintf(op_errstr, len,
+                 "Volume %s is not a distribute "
+                 "volume or contains only 1 brick.\n"
+                 "Not performing rebalance",
+                 volname);
+        goto out;
+    }
+
+    if ((*volinfo)->status != GLUSTERD_STATUS_STARTED) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_STOPPED,
+               "Received rebalance on stopped"
+               " volname %s",
+               volname);
+        snprintf(op_errstr, len,
+                 "Volume %s needs to "
+                 "be started to perform rebalance",
+                 volname);
+        goto out;
+    }
+
+    ret = 0;
 
-        sleep (5);
-        ret = rpc_clnt_transport_unix_options_build (&options, sockfile);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed");
-                goto out;
-        }
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
 
-        ret = glusterd_rpc_create (&defrag->rpc, options,
-                                   glusterd_defrag_notify, volinfo);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed");
-                goto out;
+int
+__glusterd_handle_defrag_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    glusterd_conf_t *priv = NULL;
+    int32_t op = GD_OP_NONE;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    gf_cli_defrag_type cmd = 0;
+    char msg[2048] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg),
+                     "Unable to decode the "
+                     "command");
+            goto out;
         }
-
-        if (cbk)
-                defrag->cbk_fn = cbk;
-
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          (int32_t *)&cmd);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get command");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_set_static_bin(dict, "node-uuid", MY_UUID, 16);
+    if (ret)
+        goto out;
+
+    if ((cmd == GF_DEFRAG_CMD_STATUS) || (cmd == GF_DEFRAG_CMD_STOP)) {
+        op = GD_OP_DEFRAG_BRICK_VOLUME;
+    } else
+        op = GD_OP_REBALANCE;
+
+    if (priv->op_version < GD_OP_VERSION_6_0) {
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than %d. Falling back "
+                     "to op-sm framework.",
+                     GD_OP_VERSION_6_0);
+        ret = glusterd_op_begin(req, op, dict, msg, sizeof(msg));
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases_with_brickop_phase(req, op,
+                                                                      dict);
+    }
 out:
-        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
-        return ret;
+    if (ret) {
+        if (msg[0] == '\0')
+            snprintf(msg, sizeof(msg), "Operation failed");
+        ret = glusterd_op_send_cli_response(GD_OP_REBALANCE, ret, 0, req, dict,
+                                            msg);
+    }
+
+    free(cli_req.dict.dict_val);  // malloced by xdr
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
 }
 
-
 int
-glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
-                               glusterd_conf_t *priv, int cmd)
+glusterd_handle_defrag_volume(rpcsvc_request_t *req)
 {
-        dict_t                  *options = NULL;
-        char                     sockfile[PATH_MAX] = {0,};
-        int                      ret = -1;
-        glusterd_defrag_info_t  *defrag =  NULL;
-
-        if (!volinfo->defrag)
-                volinfo->defrag = GF_CALLOC (1, sizeof (glusterd_defrag_info_t),
-                                             gf_gld_mt_defrag_info);
-        if (!volinfo->defrag)
-                goto out;
-
-        defrag = volinfo->defrag;
+    return glusterd_big_locked_handler(req, __glusterd_handle_defrag_volume);
+}
 
-        defrag->cmd = cmd;
+static int
+glusterd_brick_validation(dict_t *dict, char *key, data_t *value, void *data)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = data;
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_volume_brickinfo_get_by_brick(value->data, volinfo,
+                                                 &brickinfo, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_BRICK_NOT_FOUND,
+               "Incorrect brick %s for "
+               "volume %s",
+               value->data, volinfo->volname);
+        return ret;
+    }
+
+    if (!brickinfo->decommissioned) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_BRICK_NOT_FOUND,
+               "Incorrect brick %s for "
+               "volume %s",
+               value->data, volinfo->volname);
+        ret = -1;
+        return ret;
+    }
 
-        LOCK_INIT (&defrag->lock);
+    return ret;
+}
 
-        GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
-        ret = rpc_clnt_transport_unix_options_build (&options, sockfile);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed");
-                goto out;
+int
+glusterd_set_rebalance_id_in_rsp_dict(dict_t *req_dict, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t cmd = 0;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {0};
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(req_dict);
+
+    ret = dict_get_strn(rsp_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(rsp_dict, "rebalance-command",
+                          SLEN("rebalance-command"), &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+
+    /* The rebalance id is generated in glusterd_mgmt_v3_op_stage_rebalance(),
+     * but rsp_dict is unavailable there, so it is copied from req_dict to
+     * rsp_dict here so that the cli can display the rebalance id. */
+    if ((cmd == GF_DEFRAG_CMD_START) ||
+        (cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX) ||
+        (cmd == GF_DEFRAG_CMD_START_FORCE)) {
+        if (is_origin_glusterd(rsp_dict)) {
+            ret = dict_get_strn(req_dict, GF_REBALANCE_TID_KEY,
+                                SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+            if (ret) {
+                snprintf(msg, sizeof(msg), "Missing rebalance-id");
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+                ret = 0;
+            } else {
+                gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 rsp_dict, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+                if (ret) {
+                    snprintf(msg, sizeof(msg),
+                             "Failed to set rebalance id for volume %s",
+                             volname);
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_DICT_SET_FAILED, "%s", msg);
+                }
+            }
         }
-
-        ret = glusterd_rpc_create (&defrag->rpc, options,
-                                   glusterd_defrag_notify, volinfo);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed");
+    }
+
+    /* Set task-id, if available, in rsp_dict for operations other than
+     * start. This is needed when we want rebalance id in xml output
+     */
+    if (cmd == GF_DEFRAG_CMD_STATUS || cmd == GF_DEFRAG_CMD_STOP) {
+        if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+            if (GD_OP_REMOVE_BRICK == volinfo->rebal.op)
+                ret = glusterd_copy_uuid_to_dict(
+                    volinfo->rebal.rebalance_id, rsp_dict,
+                    GF_REMOVE_BRICK_TID_KEY, SLEN(GF_REMOVE_BRICK_TID_KEY));
+            else
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 rsp_dict, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set task-id for volume %s", volname);
                 goto out;
+            }
         }
-        ret = 0;
+    }
 out:
-        return ret;
+    return ret;
 }
 
 int
-glusterd_rebalance_cmd_validate (int cmd, char *volname,
-                                 glusterd_volinfo_t **volinfo,
-                                 char *op_errstr, size_t len)
+glusterd_mgmt_v3_op_stage_rebalance(dict_t *dict, char **op_errstr)
 {
-        int ret = -1;
-
-        if (glusterd_volinfo_find(volname, volinfo)) {
-                gf_log ("glusterd", GF_LOG_ERROR, "Received rebalance on invalid"
-                        " volname %s", volname);
-                snprintf (op_errstr, len, "Volume %s does not exist",
-                          volname);
+    char *volname = NULL;
+    char *cmd_str = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    char *task_id_str = NULL;
+    xlator_t *this = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+            /* Check if the connected clients are all of version
+             * glusterfs-3.6 and higher. This is needed to prevent some data
+             * loss issues that could occur when older clients are connected
+             * when rebalance is run. This check can be bypassed by using
+             * 'force'
+             */
+            ret = glusterd_check_client_op_version_support(
+                volname, GD_OP_VERSION_3_6_0, NULL);
+            if (ret) {
+                ret = gf_asprintf(op_errstr,
+                                  "Volume %s has one or "
+                                  "more connected clients of a version"
+                                  " lower than GlusterFS-v3.6.0. "
+                                  "Starting rebalance in this state "
+                                  "could lead to data loss.\nPlease "
+                                  "disconnect those clients before "
+                                  "attempting this command again.",
+                                  volname);
                 goto out;
-        }
-        if ((*volinfo)->brick_count <= (*volinfo)->dist_leaf_count) {
-                gf_log ("glusterd", GF_LOG_ERROR, "Volume %s is not a "
-                "distribute type or contains only 1 brick", volname);
-                snprintf (op_errstr, len, "Volume %s is not a distribute "
-                          "volume or contains only 1 brick.\n"
-                          "Not performing rebalance", volname);
+            }
+            /* Fall through */
+        case GF_DEFRAG_CMD_START_FORCE:
+            if (is_origin_glusterd(dict)) {
+                ret = glusterd_generate_and_set_task_id(
+                    dict, GF_REBALANCE_TID_KEY, SLEN(GF_REBALANCE_TID_KEY));
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                           "Failed to generate task-id");
+                    goto out;
+                }
+            } else {
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    snprintf(msg, sizeof(msg), "Missing rebalance-id");
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+                    ret = 0;
+                }
+            }
+            ret = glusterd_defrag_start_validate(volinfo, msg, sizeof(msg),
+                                                 GD_OP_REBALANCE);
+            if (ret) {
+                gf_msg_debug(this->name, 0,
+                             "defrag start validate "
+                             "failed for volume %s.",
+                             volinfo->volname);
                 goto out;
-        }
+            }
+            break;
+        case GF_DEFRAG_CMD_STATUS:
+        case GF_DEFRAG_CMD_STOP:
 
-        if ((*volinfo)->status != GLUSTERD_STATUS_STARTED) {
-                gf_log ("glusterd", GF_LOG_ERROR, "Received rebalance on stopped"
-                        " volname %s", volname);
-                snprintf (op_errstr, len, "Volume %s needs to "
-                          "be started to perform rebalance", volname);
+            ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get "
+                       "command string");
+                ret = -1;
                 goto out;
-        }
+            }
+            if ((strstr(cmd_str, "rebalance") != NULL) &&
+                (volinfo->rebal.op != GD_OP_REBALANCE)) {
+                snprintf(msg, sizeof(msg),
+                         "Rebalance not started "
+                         "for volume %s.",
+                         volinfo->volname);
+                ret = -1;
+                goto out;
+            }
+
+            if (strstr(cmd_str, "remove-brick") != NULL) {
+                if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+                    snprintf(msg, sizeof(msg),
+                             "remove-brick not "
+                             "started for volume %s.",
+                             volinfo->volname);
+                    ret = -1;
+                    goto out;
+                }
 
-        ret = 0;
+                /* For remove-brick status/stop commands, check whether the
+                 * given input brick is part of the volume or not. */
+
+                ret = dict_foreach_fnmatch(dict, "brick*",
+                                           glusterd_brick_validation, volinfo);
+                if (ret == -1) {
+                    snprintf(msg, sizeof(msg),
+                             "Incorrect brick"
+                             " for volume %s",
+                             volinfo->volname);
+                    goto out;
+                }
+            }
+            break;
+
+        default:
+            break;
+    }
 
+    ret = 0;
 out:
-        gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
-        return ret;
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
 }
 
 int
-glusterd_handle_defrag_volume (rpcsvc_request_t *req)
+glusterd_mgmt_v3_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
 {
-        int32_t                 ret     = -1;
-        gf_cli_req              cli_req = {{0,}};
-        glusterd_conf_t        *priv    = NULL;
-        dict_t                 *dict    = NULL;
-        char                   *volname = NULL;
-        gf_cli_defrag_type      cmd     = 0;
-
-        GF_ASSERT (req);
-
-        priv = THIS->private;
+    char *volname = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *tmp = NULL;
+    gf_boolean_t volfile_update = _gf_false;
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+    uint32_t commit_hash;
+    int32_t is_force = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not given");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "command not given");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd validate failed");
+        goto out;
+    }
+
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+        case GF_DEFRAG_CMD_START_FORCE:
 
-        if (!xdr_to_generic (req->msg[0], &cli_req,
-                             (xdrproc_t)xdr_gf_cli_req)) {
-                //failed to decode msg;
-                req->rpc_err = GARBAGE_ARGS;
-                goto out;
-        }
-       if (cli_req.dict.dict_len) {
-                /* Unserialize the dictionary */
-                dict  = dict_new ();
-
-                ret = dict_unserialize (cli_req.dict.dict_val,
-                                        cli_req.dict.dict_len,
-                                        &dict);
-                if (ret < 0) {
-                        gf_log ("glusterd", GF_LOG_ERROR,
-                                "failed to "
-                                "unserialize req-buffer to dictionary");
-                        goto out;
+            ret = dict_get_int32n(dict, "force", SLEN("force"), &is_force);
+            if (ret)
+                is_force = 0;
+            if (!is_force) {
+                /* Reset defrag status to 'NOT STARTED' whenever a
+                 * remove-brick/rebalance command is issued to remove
+                 * stale information from previous run.
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0;
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
                 }
-        }
+                if (!gd_should_i_start_rebalance(volinfo)) {
+                    /* Store the rebalance-id and rebalance command
+                     * even if the peer isn't starting a rebalance
+                     * process. On peers where a rebalance process
+                     * is started, glusterd_handle_defrag_start
+                     * performs the storing.
+                     * Storing this is needed for having
+                     * 'volume status' work correctly.
+                     */
+                    glusterd_store_perform_node_state_store(volinfo);
+                    break;
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_handle_defrag_start(volinfo, msg, sizeof(msg),
+                                                   cmd, NULL, GD_OP_REBALANCE);
+                break;
+            } else {
+                /* Reset defrag status to 'STARTED' so that the
+                 * pid is checked and restarted accordingly.
+                 * If the pid is not running it executes the
+                 * "NOT_STARTED" case and restarts the process
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+                volinfo->rebal.defrag_cmd = cmd;
+                volinfo->rebal.op = GD_OP_REBALANCE;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0;
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_restart_rebalance_for_volume(volinfo);
+                break;
+            }
+        case GF_DEFRAG_CMD_STOP:
+            /* Clear task-id only on explicitly stopping rebalance.
+             * Also clear the stored operation, so it doesn't cause trouble
+             * with future rebalance/remove-brick starts
+             */
+            gf_uuid_clear(volinfo->rebal.rebalance_id);
+            volinfo->rebal.op = GD_OP_NONE;
+
+            /* Fall back to the old volume file in case of decommission*/
+            cds_list_for_each_entry_safe(brickinfo, tmp, &volinfo->bricks,
+                                         brick_list)
+            {
+                if (!brickinfo->decommissioned)
+                    continue;
+                brickinfo->decommissioned = 0;
+                volfile_update = _gf_true;
+            }
+
+            if (volfile_update == _gf_false) {
+                ret = 0;
+                break;
+            }
 
-        ret = dict_get_str (dict, "volname", &volname);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR,
-                        "Failed to get volname");
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_VOLFILE_CREATE_FAIL, "failed to create volfiles");
                 goto out;
-        }
+            }
 
-        ret = dict_get_int32 (dict, "rebalance-command", (int32_t*)&cmd);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_ERROR,
-                        "Failed to get command");
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "failed to store volinfo");
                 goto out;
-        }
-
-        glusterd_rebalance_cmd_attempted_log (cmd, volname);
+            }
 
-        ret = dict_set_static_bin (dict, "node-uuid", priv->uuid, 16);
-        if (ret)
-                goto out;
+            ret = 0;
+            break;
 
-        if ((cmd == GF_DEFRAG_CMD_STATUS) ||
-              (cmd == GF_DEFRAG_CMD_STOP)) {
-                ret = glusterd_op_begin (req, GD_OP_DEFRAG_BRICK_VOLUME,
-                                                  dict);
-        } else
-                ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict);
+        case GF_DEFRAG_CMD_STATUS:
+            break;
+        default:
+            break;
+    }
 
 out:
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
 
-        glusterd_friend_sm ();
-        glusterd_op_sm ();
-
-        if (ret) {
-                if (dict)
-                        dict_unref (dict);
-                ret = glusterd_op_send_cli_response (GD_OP_REBALANCE, ret, 0, req,
-                                                     NULL, "operation failed");
-        }
-
-        if (cli_req.dict.dict_val)
-                free (cli_req.dict.dict_val);//malloced by xdr
-
-        return 0;
+    return ret;
 }
 
-
 int
-glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr)
+glusterd_op_stage_rebalance(dict_t *dict, char **op_errstr)
 {
-        char *volname = NULL;
-        int ret = 0;
-        int32_t cmd = 0;
-        char msg[2048] = {0};
-        glusterd_volinfo_t  *volinfo = NULL;
-
-        ret = dict_get_str (dict, "volname", &volname);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "volname not found");
-                goto out;
-        }
-        ret = dict_get_int32 (dict, "rebalance-command", &cmd);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "cmd not found");
-                goto out;
-        }
-
-        ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
-                                               msg, sizeof (msg));
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "failed to validate");
-                goto out;
-        }
-        switch (cmd) {
+    char *volname = NULL;
+    char *cmd_str = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    char *task_id_str = NULL;
+    dict_t *op_ctx = NULL;
+    xlator_t *this = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+    switch (cmd) {
         case GF_DEFRAG_CMD_START:
         case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+            /* Check if the connected clients are all of version
+             * glusterfs-3.6 and higher. This is needed to prevent some data
+             * loss issues that could occur when older clients are connected
+             * when rebalance is run. This check can be bypassed by using
+             * 'force'
+             */
+            ret = glusterd_check_client_op_version_support(
+                volname, GD_OP_VERSION_3_6_0, NULL);
+            if (ret) {
+                ret = gf_asprintf(op_errstr,
+                                  "Volume %s has one or "
+                                  "more connected clients of a version"
+                                  " lower than GlusterFS-v3.6.0. "
+                                  "Starting rebalance in this state "
+                                  "could lead to data loss.\nPlease "
+                                  "disconnect those clients before "
+                                  "attempting this command again.",
+                                  volname);
+                goto out;
+            }
+            /* Fall through */
         case GF_DEFRAG_CMD_START_FORCE:
-                ret = glusterd_defrag_start_validate (volinfo,
-                                                      msg, sizeof (msg));
+            if (is_origin_glusterd(dict)) {
+                op_ctx = glusterd_op_get_ctx();
+                if (!op_ctx) {
+                    ret = -1;
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                           "Failed to get op_ctx");
+                    goto out;
+                }
+
+                ret = glusterd_generate_and_set_task_id(
+                    op_ctx, GF_REBALANCE_TID_KEY, SLEN(GF_REBALANCE_TID_KEY));
                 if (ret) {
-                        gf_log (THIS->name, GF_LOG_DEBUG,
-                                "start validate failed");
-                        goto out;
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                           "Failed to generate task-id");
+                    goto out;
                 }
-                break;
+            } else {
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    snprintf(msg, sizeof(msg), "Missing rebalance-id");
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+                    ret = 0;
+                }
+            }
+            ret = glusterd_defrag_start_validate(volinfo, msg, sizeof(msg),
+                                                 GD_OP_REBALANCE);
+            if (ret) {
+                gf_msg_debug(this->name, 0,
+                             "defrag start validate "
+                             "failed for volume %s.",
+                             volinfo->volname);
+                goto out;
+            }
+            break;
         case GF_DEFRAG_CMD_STATUS:
         case GF_DEFRAG_CMD_STOP:
-                break;
+
+            ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get "
+                       "command string");
+                ret = -1;
+                goto out;
+            }
+            if ((strstr(cmd_str, "rebalance") != NULL) &&
+                (volinfo->rebal.op != GD_OP_REBALANCE)) {
+                snprintf(msg, sizeof(msg),
+                         "Rebalance not started "
+                         "for volume %s.",
+                         volinfo->volname);
+                ret = -1;
+                goto out;
+            }
+
+            if (strstr(cmd_str, "remove-brick") != NULL) {
+                if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+                    snprintf(msg, sizeof(msg),
+                             "remove-brick not "
+                             "started for volume %s.",
+                             volinfo->volname);
+                    ret = -1;
+                    goto out;
+                }
+
+                /* For remove-brick status/stop commands, check whether the
+                 * given input brick is part of the volume or not. */
+
+                ret = dict_foreach_fnmatch(dict, "brick*",
+                                           glusterd_brick_validation, volinfo);
+                if (ret == -1) {
+                    snprintf(msg, sizeof(msg),
+                             "Incorrect brick"
+                             " for volume %s",
+                             volinfo->volname);
+                    goto out;
+                }
+            }
+            break;
+
         default:
-                break;
-        }
+            break;
+    }
 
-        ret = 0;
+    ret = 0;
 out:
-        if (ret && op_errstr && msg[0])
-                *op_errstr = gf_strdup (msg);
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
 
-        return ret;
+    return ret;
 }
 
-
 int
-glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+glusterd_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
 {
-        char               *volname   = NULL;
-        int                 ret       = 0;
-        int32_t             cmd       = 0;
-        char                msg[2048] = {0};
-        glusterd_volinfo_t *volinfo   = NULL;
-        glusterd_conf_t    *priv      = NULL;
-        glusterd_brickinfo_t *brickinfo = NULL;
-        glusterd_brickinfo_t *tmp      = NULL;
-        gf_boolean_t        volfile_update = _gf_false;
-
-        priv = THIS->private;
-
-        ret = dict_get_str (dict, "volname", &volname);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "volname not given");
+    char *volname = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *tmp = NULL;
+    gf_boolean_t volfile_update = _gf_false;
+    char *task_id_str = NULL;
+    dict_t *ctx = NULL;
+    xlator_t *this = NULL;
+    uint32_t commit_hash;
+    int32_t is_force = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not given");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "command not given");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd validate failed");
+        goto out;
+    }
+
+    /* Set task-id, if available, in op_ctx dict for operations other than
+     * start
+     */
+    if (cmd == GF_DEFRAG_CMD_STATUS || cmd == GF_DEFRAG_CMD_STOP) {
+        if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+            ctx = glusterd_op_get_ctx();
+            if (!ctx) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                       "Failed to get op_ctx");
+                ret = -1;
                 goto out;
-        }
-
-        ret = dict_get_int32 (dict, "rebalance-command", &cmd);
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "command not given");
-                goto out;
-        }
-
-        ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
-                                               msg, sizeof (msg));
-        if (ret) {
-                gf_log (THIS->name, GF_LOG_DEBUG, "cmd validate failed");
+            }
+
+            if (GD_OP_REMOVE_BRICK == volinfo->rebal.op)
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 ctx, GF_REMOVE_BRICK_TID_KEY,
+                                                 SLEN(GF_REMOVE_BRICK_TID_KEY));
+            else
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 ctx, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                       "Failed to set task-id");
                 goto out;
+            }
         }
+    }
 
-        switch (cmd) {
+    switch (cmd) {
         case GF_DEFRAG_CMD_START:
         case GF_DEFRAG_CMD_START_LAYOUT_FIX:
         case GF_DEFRAG_CMD_START_FORCE:
-                ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg),
-                                                    cmd, NULL);
-                 break;
-        case GF_DEFRAG_CMD_STOP:
-                /* Fall back to the old volume file in case of decommission*/
-                list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
-                                          brick_list) {
-                        if (!brickinfo->decommissioned)
-                                continue;
-                        brickinfo->decommissioned = 0;
-                        volfile_update = _gf_true;
-                }
 
-                if (volfile_update == _gf_false) {
-                        ret = 0;
-                        break;
-                }
-
-                ret = glusterd_create_volfiles_and_notify_services (volinfo);
+            ret = dict_get_int32n(dict, "force", SLEN("force"), &is_force);
+            if (ret)
+                is_force = 0;
+            if (!is_force) {
+                /* Reset defrag status to 'NOT STARTED' whenever a
+                 * remove-brick/rebalance command is issued, to remove
+                 * stale information from the previous run.
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
                 if (ret) {
-                        gf_log (THIS->name, GF_LOG_WARNING,
-                                "failed to create volfiles");
-                        goto out;
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0;
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
                 }
-
-                ret = glusterd_store_volinfo (volinfo,
-                                             GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+                if (!gd_should_i_start_rebalance(volinfo)) {
+                    /* Store the rebalance-id and rebalance command
+                     * even if the peer isn't starting a rebalance
+                     * process. On peers where a rebalance process
+                     * is started, glusterd_handle_defrag_start
+                     * performs the storing.
+                     * Storing this is needed for having
+                     * 'volume status' work correctly.
+                     */
+                    glusterd_store_perform_node_state_store(volinfo);
+                    break;
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_handle_defrag_start(volinfo, msg, sizeof(msg),
+                                                   cmd, NULL, GD_OP_REBALANCE);
+                break;
+            } else {
+                /* Reset defrag status to 'STARTED' so that the
+                 * pid is checked and the process is restarted accordingly.
+                 * If the pid is not running, the "NOT_STARTED" case
+                 * is executed and the process is restarted.
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+                volinfo->rebal.defrag_cmd = cmd;
+                volinfo->rebal.op = GD_OP_REBALANCE;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
                 if (ret) {
-                        gf_log (THIS->name, GF_LOG_WARNING,
-                                "failed to store volinfo");
-                        goto out;
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0;
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
                 }
-
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_restart_rebalance_for_volume(volinfo);
+                break;
+            }
+        case GF_DEFRAG_CMD_STOP:
+            /* Clear task-id only on explicitly stopping rebalance.
+             * Also clear the stored operation, so it doesn't cause trouble
+             * with future rebalance/remove-brick starts
+             */
+            gf_uuid_clear(volinfo->rebal.rebalance_id);
+            volinfo->rebal.op = GD_OP_NONE;
+
+            /* Fall back to the old volume file in case of decommission */
+            cds_list_for_each_entry_safe(brickinfo, tmp, &volinfo->bricks,
+                                         brick_list)
+            {
+                if (!brickinfo->decommissioned)
+                    continue;
+                brickinfo->decommissioned = 0;
+                volfile_update = _gf_true;
+            }
+
+            if (volfile_update == _gf_false) {
                 ret = 0;
                 break;
+            }
+
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_VOLFILE_CREATE_FAIL, "failed to create volfiles");
+                goto out;
+            }
+
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "failed to store volinfo");
+                goto out;
+            }
+
+            ret = 0;
+            break;
 
         case GF_DEFRAG_CMD_STATUS:
-                break;
+            break;
         default:
-                break;
-        }
-
-        glusterd_rebalance_cmd_log (cmd, volname, ret);
+            break;
+    }
 
 out:
-        if (ret && op_errstr && msg[0])
-                *op_errstr = gf_strdup (msg);
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
 
-        return ret;
+    return ret;
 }
 
 int32_t
-glusterd_defrag_event_notify_handle (dict_t *dict)
+glusterd_defrag_event_notify_handle(dict_t *dict)
 {
-        glusterd_volinfo_t      *volinfo = NULL;
-        char                    *volname = NULL;
-        int32_t                  ret     = -1;
-
-        ret = dict_get_str (dict, "volname", &volname);
-        if (ret) {
-                gf_log ("", GF_LOG_ERROR, "Failed to get volname");
-                return ret;
-        }
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char *volname_ptr = NULL;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volname");
+        return ret;
+    }
+
+    volname_ptr = strstr(volname, "rebalance/");
+    if (volname_ptr) {
+        volname_ptr = strchr(volname_ptr, '/');
+        volname = volname_ptr + 1;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_REBALANCE_PFX_IN_VOLNAME,
+               "volname received (%s) is not prefixed with rebalance.",
+               volname);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo for %s", volname);
+        return ret;
+    }
 
-        ret = glusterd_volinfo_find (volname, &volinfo);
-        if (ret) {
-                gf_log ("", GF_LOG_ERROR, "Failed to get volinfo for %s"
-                        , volname);
-                return ret;
-        }
+    ret = glusterd_defrag_volume_status_update(volinfo, dict, 0);
 
-        ret = glusterd_defrag_volume_status_update (volinfo, dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DEFRAG_STATUS_UPDATE_FAIL,
+               "Failed to update status");
+        gf_event(EVENT_REBALANCE_STATUS_UPDATE_FAILED, "volume=%s",
+                 volinfo->volname);
+    }
 
-        if (ret)
-                gf_log ("", GF_LOG_ERROR, "Failed to update status");
-        return ret;
+out:
+    return ret;
 }